fix

gemma 3n
fix init of the MambaCaches
2025-11-03 03:14:36 +08:00 · 2025-07-09 19:23:29 +02:00 · 2025-07-09 19:10:52 +02:00 · 2025-07-09 18:49:31 +02:00 · 2025-07-09 18:49:31 +02:00 · 2025-07-09 18:49:31 +02:00
1648 changed files with 40288 additions and 88219 deletions
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -303,7 +303,7 @@ non_model_job = CircleCIJob(
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
    # networkx==3.3 (after #36957) cause some issues
    # TODO: remove this once it works directly
-    install_steps=["uv venv && uv pip install .[serving]"],
+    install_steps=["uv venv && uv pip install ."],
    marker="not generate",
    parallelism=6,
 )
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -31,7 +31,7 @@ jobs:
      group: aws-g5-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -18,7 +18,7 @@ jobs:
      group: aws-g5-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      job_splits: ${{ steps.set-matrix.outputs.job_splits }}
      split_keys: ${{ steps.set-matrix.outputs.split_keys }}
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -74,7 +74,7 @@ jobs:

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers natten && python3 -m pip install -e .
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: Update / Install some packages (for Past CI)
        if: ${{ contains(inputs.docker, '-past-') }}
@ -118,42 +118,25 @@ jobs:
          echo "$machine_type"
          echo "machine_type=$machine_type" >> $GITHUB_ENV

-      - name: checkout to start_commit
-        working-directory: /transformers
-        run: git checkout 8c4ea670dceace8d9b1bac8310bc62146b7134cd
-
-      - name: download and copy the fixed script
-        working-directory: /transformers
-        shell: bash
-        run: apt-get update && apt-get install -y wget curl && rm -rf utils/check_bad_commit.py && curl -O https://raw.githubusercontent.com/huggingface/transformers/refs/heads/temp_get_new_failed_info/check_bad_commit.py && curl -O https://raw.githubusercontent.com/huggingface/transformers/refs/heads/temp_get_new_failed_info/new_failures_2.json && curl -O https://raw.githubusercontent.com/huggingface/transformers/refs/heads/temp_get_new_failed_info/job_links.json && curl -O https://raw.githubusercontent.com/huggingface/transformers/refs/heads/temp_get_new_failed_info/process_bad_commit_report.py && cp check_bad_commit.py utils/check_bad_commit.py && cp process_bad_commit_report.py utils/process_bad_commit_report.py
-
      - name: Run all tests on GPU
        working-directory: /transformers
-        # run: python3 utils/check_bad_commit.py --start_commit 8c4ea670dceace8d9b1bac8310bc62146b7134cd --end_commit 6dfd561d9cd722dfc09f702355518c6d09b9b4e3 --file new_failures.json --output_file new_failures_with_bad_commit.json
-        run: python3 utils/check_bad_commit.py --start_commit 8c4ea670dceace8d9b1bac8310bc62146b7134cd --end_commit 6dfd561d9cd722dfc09f702355518c6d09b9b4e3 --file new_failures_2.json --output_file new_failures_with_bad_commit.json
+        run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}

-      - name: "Upload new_failures_with_bad_commit.json"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: new_failures_with_bad_commit
-          path: /transformers/new_failures_with_bad_commit.json
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt

-      - name: Show results
-        working-directory: /transformers
-        run: |
-          ls -la new_failures_with_bad_commit.json
-          cat new_failures_with_bad_commit.json
-
-      - name: Process report
+      - name: Run test
        shell: bash
-        working-directory: /transformers
        run: |
-          python3 utils/process_bad_commit_report.py
+          mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
+          echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"

-      - name: "Upload new_failures_with_bad_commit_grouped_by_authors.json"
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: new_failures_with_bad_commit_grouped_by_authors
-          path: /transformers/new_failures_with_bad_commit_grouped_by_authors.json
+          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -36,7 +36,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu-push-ci
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      test_map: ${{ steps.set-matrix.outputs.test_map }}
@ -136,7 +136,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu-push-ci
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    env:
      # For the meaning of these environment variables, see the job `Setup`
      CI_BRANCH_PUSH: ${{ github.event.ref }}
@ -362,7 +362,7 @@ jobs:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    env:
      # For the meaning of these environment variables, see the job `Setup`
      CI_BRANCH_PUSH: ${{ github.event.ref }}
--- a/.github/workflows/self-scheduled-amd-mi325-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi325-caller.yml
@ -1,63 +0,0 @@
-name: Self-hosted runner scale set (AMD mi325 scheduled CI caller)
-
-# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
-# For example, 1gpu scale set: amd-mi325-ci-1gpu
-#              2gpu scale set: amd-mi325-ci-2gpu
-
-on:
-  workflow_run:
-    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
-    branches: ["main"]
-    types: [completed]
-  push:
-    branches:
-      - run_amd_scheduled_ci_caller*
-
-jobs:
-  model-ci:
-    name: Model CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_models_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi325-ci
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
-
-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi325-ci
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi325-ci
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#amd-hf-ci"
-      runner_scale_set: amd-mi325-ci
-      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi325
-      report_repo_id: optimum-amd/transformers_daily_ci
-    secrets: inherit
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -7,7 +7,7 @@ on:
    - cron: "17 2 * * *"
  push:
    branches:
-      - temp_get_new_failed_info
+      - run_scheduled_ci*
  workflow_dispatch:
    inputs:
      prev_workflow_run_id:
@ -50,8 +50,64 @@ jobs:
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_models_gpu
-      slack_report_channel: "#transformers-ci-dummy"
+      slack_report_channel: "#transformers-ci-daily-models"
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit
+
+  torch-pipeline:
+    name: Torch pipeline CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_pipelines_torch_gpu
+      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
+      docker: huggingface/transformers-pytorch-gpu
+      ci_event: Daily CI
+      report_repo_id: hf-internal-testing/transformers_daily_ci
+    secrets: inherit
+
+  example-ci:
+    name: Example CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_examples_gpu
+      slack_report_channel: "#transformers-ci-daily-examples"
+      docker: huggingface/transformers-all-latest-gpu
+      ci_event: Daily CI
+      report_repo_id: hf-internal-testing/transformers_daily_ci
+    secrets: inherit
+
+  trainer-fsdp-ci:
+    name: Trainer/FSDP CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_trainer_and_fsdp_gpu
+      slack_report_channel: "#transformers-ci-daily-training"
+      docker: huggingface/transformers-all-latest-gpu
+      ci_event: Daily CI
+      report_repo_id: hf-internal-testing/transformers_daily_ci
+    secrets: inherit
+
+  deepspeed-ci:
+    name: DeepSpeed CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_torch_cuda_extensions_gpu
+      slack_report_channel: "#transformers-ci-daily-training"
+      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
+      ci_event: Daily CI
+      working-directory-prefix: /workspace
+      report_repo_id: hf-internal-testing/transformers_daily_ci
+    secrets: inherit
+
+  quantization-ci:
+    name: Quantization CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_quantization_torch_gpu
+      slack_report_channel: "#transformers-ci-daily-quantization"
+      docker: huggingface/transformers-quantization-latest-gpu
+      ci_event: Daily CI
+      report_repo_id: hf-internal-testing/transformers_daily_ci
+    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -50,12 +50,12 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g5-4xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
@ -121,11 +121,390 @@ jobs:
      docker: ${{ inputs.docker }}
    secrets: inherit

+  run_trainer_and_fsdp_gpu:
+    if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
+    name: " "
+    needs: setup
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        slice_id: [0, 1]
+    uses: ./.github/workflows/model_jobs.yml
+    with:
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      machine_type: ${{ matrix.machine_type }}
+      slice_id: ${{ matrix.slice_id }}
+      runner_map: ${{ needs.setup.outputs.runner_map }}
+      docker: ${{ inputs.docker }}
+      report_name_prefix: run_trainer_and_fsdp_gpu
+    secrets: inherit
+
+  run_pipelines_torch_gpu:
+    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
+    name: PyTorch pipelines
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-pytorch-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run all pipeline tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
+
+  run_examples_gpu:
+    if: ${{ inputs.job == 'run_examples_gpu' }}
+    name: Examples directory
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [aws-g5-4xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run examples tests on GPU
+        working-directory: /transformers
+        run: |
+          pip install -r examples/pytorch/_tests_requirements.txt
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_run_examples_gpu_test_reports
+
+  run_torch_cuda_extensions_gpu:
+    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
+    name: Torch CUDA extension tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: ${{ inputs.docker }}
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: Update / Install some packages (for Past CI)
+        if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
+        run: |
+          python3 -m pip install -U datasets
+          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
+
+      - name: Remove cached torch extensions
+        run: rm -rf /github/home/.cache/torch_extensions/
+
+      # To avoid unknown test failures
+      - name: Pre build DeepSpeed *again* (for daily CI)
+        if: ${{ contains(inputs.ci_event, 'Daily CI') }}
+        working-directory: ${{ inputs.working-directory-prefix }}/
+        run: |
+          python3 -m pip uninstall -y deepspeed
+          DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
+      # To avoid unknown test failures
+      - name: Pre build DeepSpeed *again* (for nightly & Past CI)
+        if: ${{ contains(inputs.ci_event, 'Nightly CI') || contains(inputs.ci_event, 'Past CI') }}
+        working-directory: ${{ inputs.working-directory-prefix }}/
+        run: |
+          python3 -m pip uninstall -y deepspeed
+          rm -rf DeepSpeed
+          git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
+        run: pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run all tests on GPU
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
+        run: |
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat ${{ inputs.working-directory-prefix }}/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+          path: ${{ inputs.working-directory-prefix }}/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+
+  run_quantization_torch_gpu:
+    if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
+    name: " "
+    needs: setup
+    strategy:
+      max-parallel: 4
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-quantization-latest-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run quantization tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
+
+  run_extract_warnings:
+    # Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
+    if: ${{ always() && inputs.job == 'run_models_gpu' }}
+    name: Extract warnings in CI artifacts
+    runs-on: ubuntu-22.04
+    needs: [setup, run_models_gpu]
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+
+      - name: Install transformers
+        run: pip install transformers
+
+      - name: Show installed libraries and their versions
+        run: pip freeze
+
+      - name: Create output directory
+        run: mkdir warnings_in_ci
+
+      - uses: actions/download-artifact@v4
+        with:
+          path: warnings_in_ci
+
+      - name: Show artifacts
+        run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')"
+        working-directory: warnings_in_ci
+
+      - name: Extract warnings in CI artifacts
+        run: |
+          python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
+          echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
+
+      - name: Upload artifact
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: warnings_in_ci
+          path: warnings_in_ci/selected_warnings.json
+
  send_results:
    name: Slack Report
    needs: [
      setup,
      run_models_gpu,
+      run_trainer_and_fsdp_gpu,
+      run_pipelines_torch_gpu,
+      run_examples_gpu,
+      run_torch_cuda_extensions_gpu,
+      run_quantization_torch_gpu,
+      run_extract_warnings
    ]
    if: ${{ always() }}
    uses: ./.github/workflows/slack-report.yml
@ -141,3 +520,18 @@ jobs:
      report_repo_id: ${{ inputs.report_repo_id }}

    secrets: inherit
+
+  check_new_failures:
+    if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
+    name: Check new failures
+    needs: send_results
+    uses: ./.github/workflows/check_failed_tests.yml
+    with:
+      docker: ${{ inputs.docker }}
+      start_sha: ${{ github.sha }}
+      job: ${{ inputs.job }}
+      slack_report_channel: ${{ inputs.slack_report_channel }}
+      ci_event: ${{ inputs.ci_event }}
+      report_repo_id: ${{ inputs.report_repo_id }}
+
+    secrets: inherit
--- a/.gitignore
+++ b/.gitignore
@ -167,6 +167,3 @@ tags

 # ruff
 .ruff_cache
-
-# modular conversion
-*.modular_backup
--- a/2
+++ b/2
@ -86,11 +86,11 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency

 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
+	python utils/check_docstrings.py --fix_and_overwrite
 	python utils/check_modular_conversion.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
 	python utils/check_pipeline_typing.py --fix_and_overwrite
 	python utils/check_doctest_list.py --fix_and_overwrite
-	python utils/check_docstrings.py --fix_and_overwrite

 # Run tests for the library

--- a/README.md
+++ b/README.md
@ -44,7 +44,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
-        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
@ -242,7 +242,7 @@ pipeline(

 - This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
 - The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like [Accelerate](https://huggingface.co/docs/accelerate).
- The [example scripts](https://github.com/huggingface/transformers/tree/main/examples) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.
+- The [example scripts](https://github.com/huggingface/transformers/tree/main/examples) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.

 ## 100 projects using Transformers

--- a/check_bad_commit.py
+++ b/check_bad_commit.py
@ -1,220 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import json
-import os
-import re
-import subprocess
-
-import requests
-
-
-def create_script(target_test):
-    """Create a python script to be run by `git bisect run` to determine if `target_test` passes or fails.
-    If a test is not found in a commit, the script with exit code `0` (i.e. `Success`).
-
-    Args:
-        target_test (`str`): The test to check.
-
-    Returns:
-        `str`: The script to be run by `git bisect run`.
-    """
-
-    script = f"""
-import os
-import subprocess
-
-result = subprocess.run(
-    ["python3", "-m", "pytest", "-v", "-rfEp", f"{target_test}"],
-    capture_output = True,
-    text=True,
-)
-print(result.stdout)
-
-if f"PASSED {target_test}" in result.stdout:
-    print("test passed")
-    exit(0)
-elif len(result.stderr) > 0:
-    if "ERROR: file or directory not found: " in result.stderr:
-        print("test file or directory not found in this commit")
-        exit(0)
-    elif "ERROR: not found: " in result.stderr:
-        print("test not found in this commit")
-        exit(0)
-    else:
-        print(f"pytest failed to run: {{result.stderr}}")
-        exit(-1)
-elif f"FAILED {target_test}" in result.stdout:
-    print("test failed")
-    exit(2)
-
-exit(0)
-"""
-
-    with open("target_script.py", "w") as fp:
-        fp.write(script.strip())
-
-
-def find_bad_commit(target_test, start_commit, end_commit):
-    """Find (backward) the earliest commit between `start_commit` and `end_commit` at which `target_test` fails.
-
-    Args:
-        target_test (`str`): The test to check.
-        start_commit (`str`): The latest commit.
-        end_commit (`str`): The earliest commit.
-
-    Returns:
-        `str`: The earliest commit at which `target_test` fails.
-    """
-
-    if start_commit == end_commit:
-        return start_commit
-
-    create_script(target_test=target_test)
-
-    bash = f"""
-git bisect reset
-git bisect start {start_commit} {end_commit}
-git bisect run python3 target_script.py
-"""
-
-    with open("run_git_bisect.sh", "w") as fp:
-        fp.write(bash.strip())
-
-    result = subprocess.run(
-        ["bash", "run_git_bisect.sh"],
-        check=False,
-        capture_output=True,
-        text=True,
-    )
-    print(result.stdout)
-
-    if "error: bisect run failed" in result.stderr:
-        index = result.stderr.find("error: bisect run failed")
-        bash_error = result.stderr[index:]
-
-        error_msg = f"Error when running git bisect:\nbash error: {bash_error}"
-
-        pattern = "pytest failed to run: .+"
-        pytest_errors = re.findall(pattern, result.stdout)
-        if len(pytest_errors) > 0:
-            pytest_error = pytest_errors[0]
-            index = pytest_error.find("pytest failed to run: ")
-            index += len("pytest failed to run: ")
-            pytest_error = pytest_error[index:]
-            error_msg += f"pytest error: {pytest_error}"
-
-        raise ValueError(error_msg)
-
-    pattern = r"(.+) is the first bad commit"
-    commits = re.findall(pattern, result.stdout)
-
-    bad_commit = None
-    if len(commits) > 0:
-        bad_commit = commits[0]
-
-    print(f"Between `start_commit` {start_commit} and `end_commit` {end_commit}")
-    print(f"bad_commit: {bad_commit}\n")
-
-    return bad_commit
-
-
-def get_commit_info(commit):
-    """Get information for a commit via `api.github.com`."""
-    pr_number = None
-    author = None
-    merged_author = None
-
-    url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}/pulls"
-    pr_info_for_commit = requests.get(url).json()
-
-    if len(pr_info_for_commit) > 0:
-        pr_number = pr_info_for_commit[0]["number"]
-
-        url = f"https://api.github.com/repos/huggingface/transformers/pulls/{pr_number}"
-        pr_for_commit = requests.get(url).json()
-        author = pr_for_commit["user"]["login"]
-        if pr_for_commit["merged_by"] is not None:
-            merged_author = pr_for_commit["merged_by"]["login"]
-
-    if author is None:
-        url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}"
-        commit_info = requests.get(url).json()
-        author = commit_info["author"]["login"]
-
-    return {"commit": commit, "pr_number": pr_number, "author": author, "merged_by": merged_author}
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--start_commit", type=str, required=True, help="The latest commit hash to check.")
-    parser.add_argument("--end_commit", type=str, required=True, help="The earliest commit hash to check.")
-    parser.add_argument("--test", type=str, help="The test to check.")
-    parser.add_argument("--file", type=str, help="The report file.")
-    parser.add_argument("--output_file", type=str, required=True, help="The path of the output file.")
-    args = parser.parse_args()
-
-    print(f"start_commit: {args.start_commit}")
-    print(f"end_commit: {args.end_commit}")
-
-    # `get_commit_info` uses `requests.get()` to request info. via `api.github.com` without using token.
-    # If there are many new failed tests in a workflow run, this script may fail at some point with `KeyError` at
-    # `pr_number = pr_info_for_commit[0]["number"]` due to the rate limit.
-    # Let's cache the commit info. and reuse them whenever possible.
-    commit_info_cache = {}
-
-    if len({args.test is None, args.file is None}) != 2:
-        raise ValueError("Exactly one argument `test` or `file` must be specified.")
-
-    if args.test is not None:
-        commit = find_bad_commit(target_test=args.test, start_commit=args.start_commit, end_commit=args.end_commit)
-        with open(args.output_file, "w", encoding="UTF-8") as fp:
-            fp.write(f"{args.test}\n{commit}")
-    elif os.path.isfile(args.file):
-        with open(args.file, "r", encoding="UTF-8") as fp:
-            reports = json.load(fp)
-
-        for model in reports:
-            # TODO: make this script able to deal with both `single-gpu` and `multi-gpu` via a new argument.
-            reports[model].pop("multi-gpu", None)
-            failed_tests = reports[model]["single-gpu"]
-
-            failed_tests_with_bad_commits = []
-            for test in failed_tests:
-                commit = find_bad_commit(target_test=test, start_commit=args.start_commit, end_commit=args.end_commit)
-                info = {"test": test, "commit": commit}
-
-                if commit in commit_info_cache:
-                    commit_info = commit_info_cache[commit]
-                else:
-                    commit_info = get_commit_info(commit)
-                    commit_info_cache[commit] = commit_info
-
-                info.update(commit_info)
-                failed_tests_with_bad_commits.append(info)
-
-            # If no single-gpu test failures, remove the key
-            if len(failed_tests_with_bad_commits) > 0:
-                reports[model]["single-gpu"] = failed_tests_with_bad_commits
-            else:
-                reports[model].pop("single-gpu", None)
-
-        # remove the models without any test failure
-        reports = {k: v for k, v in reports.items() if len(v) > 0}
-
-        with open(args.output_file, "w", encoding="UTF-8") as fp:
-            json.dump(reports, fp, ensure_ascii=False, indent=4)
--- a/conftest.py
+++ b/conftest.py
@ -28,7 +28,6 @@ from transformers.testing_utils import HfDoctestModule, HfDocTestParser

 NOT_DEVICE_TESTS = {
    "test_tokenization",
-    "test_tokenization_mistral_common",
    "test_processor",
    "test_processing",
    "test_beam_constraints",
@ -84,8 +83,6 @@ def pytest_configure(config):
    config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
    config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
    config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
-    config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
-    config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")


 def pytest_collection_modifyitems(items):
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -30,8 +30,6 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] &&

 RUN python3 -m pip uninstall -y flax jax

-RUN python3 -m pip install --no-cache-dir -U timm
-
 RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -1,8 +1,11 @@
-FROM rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
+FROM rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

+ARG TORCH_VISION='0.21.0'
+ARG TORCH_AUDIO='2.6.0'
+
 RUN apt update && \
    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg git-lfs && \
    apt clean && \
@ -20,12 +23,9 @@ WORKDIR /
 ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

-# On ROCm, torchcodec is required to decode audio files
-# RUN python3 -m pip install --no-cache-dir torchcodec
-# Install transformers
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video,audio]
+RUN python3 -m pip install --no-cache-dir torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]

-# Remove tensorflow and flax as they are no longer supported by transformers
 RUN python3 -m pip uninstall -y tensorflow flax

 # When installing in editable mode, `transformers` is not recognized as a package.
@ -36,4 +36,4 @@ RUN cd transformers && python3 setup.py develop
 RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y

 # `kernels` may causes many failing tests
-RUN python3 -m pip uninstall -y kernels
+RUN python3 -m pip uninstall -y kernels
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -78,9 +78,6 @@ RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submod
 # RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
 # RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git

-# Add fp-quant for quantization testing
-RUN python3 -m pip install --no-cache-dir "fp-quant>=0.1.6"
-
 # Add compressed-tensors for quantization testing
 RUN python3 -m pip install --no-cache-dir compressed-tensors

--- a/docs/source/ar/custom_models.md
+++ b/docs/source/ar/custom_models.md
@ -280,7 +280,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 الآن لإرسال النموذج إلى Hub، تأكد من تسجيل الدخول. إما تشغيل في المحطة الأوامر الطرفية الخاصة بك:

 ```bash
-hf auth login
+huggingface-cli login
 ```

 أو من دفتر ملاحظات:
--- a/docs/source/ar/llm_tutorial_optimization.md
+++ b/docs/source/ar/llm_tutorial_optimization.md
@ -13,11 +13,11 @@

 في هذا الدليل، سنستعرض التقنيات الفعالة لتُحسِّن من كفاءة نشر نماذج اللغة الكبيرة:

-1. سنتناول تقنية "دقة أقل" التي أثبتت الأبحاث فعاليتها في تحقيق مزايا حسابية دون التأثير بشكل ملحوظ على أداء النموذج عن طريق العمل بدقة رقمية أقل [8 بت و4 بت](/main_classes/quantization).
+1. سنتناول تقنية "دقة أقل" التي أثبتت الأبحاث فعاليتها في تحقيق مزايا حسابية دون التأثير بشكل ملحوظ على أداء النموذج عن طريق العمل بدقة رقمية أقل [8 بت و4 بت](/main_classes/quantization.md).

 2.  **اFlash Attention:** إن Flash Attention وهي نسخة مُعدَّلة من خوارزمية الانتباه التي لا توفر فقط نهجًا أكثر كفاءة في استخدام الذاكرة، ولكنها تحقق أيضًا كفاءة متزايدة بسبب الاستخدام الأمثل لذاكرة GPU.

-3.  **الابتكارات المعمارية:** حيث تم اقتراح هياكل متخصصة تسمح باستدلال أكثر فعالية نظرًا لأن نماذج اللغة الكبيرة يتم نشرها دائمًا بنفس الطريقة أثناء عملية الاستدلال، أي توليد النص التنبؤي التلقائي مع سياق الإدخال الطويل، فقد تم اقتراح بنيات نموذج متخصصة تسمح بالاستدلال الأكثر كفاءة. أهم تقدم في بنيات النماذج هنا هو [عذر](https://huggingface.co/papers/2108.12409)، [الترميز الدوار](https://huggingface.co/papers/2104.09864)، [الاهتمام متعدد الاستعلامات (MQA)](https://huggingface.co/papers/1911.02150) و [مجموعة الانتباه بالاستعلام (GQA)](https://huggingface.co/papers/2305.13245).
+3.  **الابتكارات المعمارية:** حيث تم اقتراح هياكل متخصصة تسمح باستدلال أكثر فعالية نظرًا لأن نماذج اللغة الكبيرة يتم نشرها دائمًا بنفس الطريقة أثناء عملية الاستدلال، أي توليد النص التنبؤي التلقائي مع سياق الإدخال الطويل، فقد تم اقتراح بنيات نموذج متخصصة تسمح بالاستدلال الأكثر كفاءة. أهم تقدم في بنيات النماذج هنا هو [عذر](https://huggingface.co/papers/2108.12409)، [الترميز الدوار](https://huggingface.co/papers/2104.09864)، [الاهتمام متعدد الاستعلامات (MQA)](https://huggingface.co/papers/1911.02150) و [مجموعة الانتباه بالاستعلام (GQA)](https://huggingface.co/papers/2305.13245).

 على مدار هذا الدليل، سنقدم تحليلًا للتوليد التنبؤي التلقائي من منظور المُوتِّرات. نتعمق في مزايا وعيوب استخدام دقة أقل، ونقدم استكشافًا شاملاً لخوارزميات الانتباه الأحدث، ونناقش بنيات نماذج نماذج اللغة الكبيرة المحسنة. سندعم الشرح بأمثلة عملية تُبرِز كل تحسين على حدة.

--- a/docs/source/ar/model_sharing.md
+++ b/docs/source/ar/model_sharing.md
@ -41,7 +41,7 @@ picture-in-picture" allowfullscreen></iframe>
 قبل مشاركة نموذج على Hub، ستحتاج إلى بيانات اعتماد حساب Hugging Face الخاصة بك.  إذا كنت تستخدم منصة الأوامر، فقم بتشغيل الأمر التالي في بيئة افتراضية حيث تم تثبيت 🤗 Transformers. سيقوم هذا الأمر بتخزين رمز الدخول الخاص بك في مجلد تخزين المؤقت لـ Hugging Face (`~/.cache/` بشكل افتراضي):

 ```bash
-hf auth login
+huggingface-cli login
 ```

 إذا كنت تستخدم دفتر ملاحظات مثل Jupyter أو Colaboratory، فتأكد من تثبيت مكتبة [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). تسمح لك هذه المكتبة بالتفاعل برمجيًا مع Hub.
--- a/docs/source/ar/run_scripts.md
+++ b/docs/source/ar/run_scripts.md
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
 يمكن لجميع النصوص البرمجية رفع نموذجك النهائي إلى [مركز النماذج](https://huggingface.co/models). تأكد من تسجيل الدخول إلى Hugging Face قبل البدء:

 ```bash
-hf auth login
+huggingface-cli login
 ```

 ثم أضف المعلمة `push_to_hub` إلى النص البرمجي . ستقوم هذه المعلمة بإنشاء مستودع باستخدام اسم مستخدم Hugging Face واسم المجلد المحدد في `output_dir`.
--- a/docs/source/de/model_sharing.md
+++ b/docs/source/de/model_sharing.md
@ -56,7 +56,7 @@ Dateien lassen sich auch in einem Repository leicht bearbeiten, und Sie können
 Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert:

 ```bash
-hf auth login
+huggingface-cli login
 ```

 Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub.
--- a/docs/source/de/run_scripts.md
+++ b/docs/source/de/run_scripts.md
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
 Alle Skripte können Ihr endgültiges Modell in den [Model Hub](https://huggingface.co/models) hochladen. Stellen Sie sicher, dass Sie bei Hugging Face angemeldet sind, bevor Sie beginnen:

 ```bash
-hf auth login
+huggingface-cli login
 ```

 Dann fügen Sie dem Skript das Argument `push_to_hub` hinzu. Mit diesem Argument wird ein Repository mit Ihrem Hugging Face-Benutzernamen und dem in `output_dir` angegebenen Ordnernamen erstellt.
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -72,6 +72,8 @@
      title: Caching
    - local: kv_cache
      title: KV cache strategies
+    - local: serving
+      title: Serving
    - local: llm_tutorial_optimization
      title: Getting the most out of LLMs
    - local: perplexity
@ -103,10 +105,6 @@
    title: Agents
  - local: tools
    title: Tools
-  - local: serving
-    title: Serving
-  - local: transformers_as_backend
-    title: Inference server backends
  title: Inference
 - isExpanded: false
  sections:
@ -179,8 +177,6 @@
    title: FBGEMM
  - local: quantization/finegrained_fp8
    title: Fine-grained FP8
-  - local: quantization/fp_quant
-    title: FP-Quant
  - local: gguf
    title: GGUF
  - local: quantization/gptq
@ -445,16 +441,10 @@
        title: Encoder Decoder Models
      - local: model_doc/ernie
        title: ERNIE
-      - local: model_doc/ernie4_5
-        title: Ernie4_5
-      - local: model_doc/ernie4_5_moe
-        title: Ernie4_5_MoE
      - local: model_doc/ernie_m
        title: ErnieM
      - local: model_doc/esm
        title: ESM
-      - local: model_doc/exaone4
-        title: EXAONE-4.0
      - local: model_doc/falcon
        title: Falcon
      - local: model_doc/falcon3
@ -485,8 +475,6 @@
        title: GLM
      - local: model_doc/glm4
        title: glm4
-      - local: model_doc/glm4_moe
-        title: glm4_moe
      - local: model_doc/openai-gpt
        title: GPT
      - local: model_doc/gpt_neo
@ -529,8 +517,6 @@
        title: Jukebox
      - local: model_doc/led
        title: LED
-      - local: model_doc/lfm2
-        title: LFM2
      - local: model_doc/llama
        title: LLaMA
      - local: model_doc/llama2
@ -575,8 +561,6 @@
        title: MobileBERT
      - local: model_doc/modernbert
        title: ModernBert
-      - local: model_doc/modernbert-decoder
-        title: ModernBERTDecoder
      - local: model_doc/mpnet
        title: MPNet
      - local: model_doc/mpt
@ -699,8 +683,6 @@
        title: XLM-V
      - local: model_doc/xlnet
        title: XLNet
-      - local: model_doc/xlstm
-        title: xLSTM
      - local: model_doc/yoso
        title: YOSO
      - local: model_doc/zamba
@ -729,10 +711,6 @@
        title: DAB-DETR
      - local: model_doc/deepseek_v2
        title: DeepSeek-V2
-      - local: model_doc/deepseek_vl
-        title: DeepseekVL
-      - local: model_doc/deepseek_vl_hybrid
-        title: DeepseekVLHybrid
      - local: model_doc/deformable_detr
        title: Deformable DETR
      - local: model_doc/deit
@ -759,8 +737,6 @@
        title: DPT
      - local: model_doc/efficientformer
        title: EfficientFormer
-      - local: model_doc/efficientloftr
-        title: EfficientLoFTR
      - local: model_doc/efficientnet
        title: EfficientNet
      - local: model_doc/eomt
@ -971,8 +947,6 @@
        title: CLIPSeg
      - local: model_doc/clvp
        title: CLVP
-      - local: model_doc/cohere2_vision
-        title: Cohere2Vision
      - local: model_doc/colpali
        title: ColPali
      - local: model_doc/colqwen2
@ -985,8 +959,6 @@
        title: Donut
      - local: model_doc/emu3
        title: Emu3
-      - local: model_doc/evolla
-        title: Evolla
      - local: model_doc/flava
        title: FLAVA
      - local: model_doc/gemma3
@ -1051,8 +1023,6 @@
        title: Mistral3
      - local: model_doc/mllama
        title: mllama
-      - local: model_doc/mm-grounding-dino
-        title: MM Grounding DINO
      - local: model_doc/nougat
        title: Nougat
      - local: model_doc/omdet-turbo
@ -1067,8 +1037,6 @@
        title: PaliGemma
      - local: model_doc/perceiver
        title: Perceiver
-      - local: model_doc/perception_lm
-        title: PerceptionLM
      - local: model_doc/phi4_multimodal
        title: Phi4 Multimodal
      - local: model_doc/pix2struct
@ -1121,8 +1089,6 @@
        title: Vision Text Dual Encoder
      - local: model_doc/visual_bert
        title: VisualBERT
-      - local: model_doc/voxtral
-        title: Voxtral
      - local: model_doc/xclip
        title: X-CLIP
      title: Multimodal models
--- a/docs/source/en/attention_interface.md
+++ b/docs/source/en/attention_interface.md
@ -60,11 +60,11 @@ You will see it prints "I just entered the attention computation" as many times

 ## Dynamically switching attention function

-You could dynamically change the model's attention function as well:
+You could dynamically change the model's attention function as well, by overriding the `config._attn_implementation` field:

 ```python
 # Back to use original sdpa implementation
-model.set_attn_implementation("sdpa")
+model.config._attn_implementation = "sdpa"

 model(torch.ones(1, 5, dtype=int))
 ```
@ -72,34 +72,6 @@ model(torch.ones(1, 5, dtype=int))
 and it will stop printing the statements, as it now uses the `sdpa` attention.  
 This allows to quickly change an attention function, without needing to reload the model!

-## Different attention per backbone in multimodal models
-
-For multimodal models different attention functions may work better for each backbone module. For example, some vision backbones perform better in fp32, but are incompatible with FlashAttention. To continue using FlashAttention while keeping the vision encoder in fp32, create a dict and map each config to an attention implementation as shown below.
-
-```python
-from transformers import AutoModelForImageTextToText
-
-model_id = "facebook/chameleon-7b"
-
-attention_implementation_per_backbone = {"vision_config": "sdpa", "text_config": "flash_attention_2"}
-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation=attention_implementation_per_backbone)
-
-# NOTE: keys in the attention implementation have to be the same as the sub-config names
-for key in attention_implementation_per_backbone:
-    assert key in model.config.sub_configs, f"Invalid key in `attention_implementation`"
-
-# You can omit certain backbones - the default attention function (SDPA) will be used
-# This is equivalent to the previous example
-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"text_config": "flash_attention_2"})
-
-
-# Set the same attention implementation for all backbones with single string, same as in non-multimodal models
-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager")
-
-# Alternatively use a dict with an empty key for global configuration
-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"": "eager"})
-```
-
 ## What about new args needed in my custom attention function?

 But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
--- a/docs/source/en/auto_docstring.md
+++ b/docs/source/en/auto_docstring.md
@ -64,9 +64,9 @@ Arguments can also be passed directly to `@auto_docstring` for more control. Use
    It builds upon the standard Transformer architecture with unique modifications.""",
    custom_args="""
    custom_parameter (`type`, *optional*, defaults to `default_value`):
-        A concise description for custom_parameter if not defined or overriding the description in `auto_docstring.py`.
+        A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
    internal_helper_arg (`type`, *optional*, defaults to `default_value`):
-        A concise description for internal_helper_arg if not defined or overriding the description in `auto_docstring.py`.
+        A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
    """
 )
 class MySpecialModel(PreTrainedModel):
@ -85,40 +85,13 @@ class MySpecialModel(PreTrainedModel):
    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
        r"""
        custom_parameter (`type`, *optional*, defaults to `default_value`):
-            A concise description for custom_parameter if not defined or overriding the description in `auto_docstring.py`.
+            A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
        internal_helper_arg (`type`, *optional*, defaults to `default_value`):
-            A concise description for internal_helper_arg if not defined or overriding the description in `auto_docstring.py`.
+            A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
        """
        # ...
 ```

-You should also use the `@auto_docstring` decorator for classes that inherit from [`~utils.ModelOutput`].
-
-```python
-@dataclass
-@auto_docstring(
-    custom_intro="""
-    Custom model outputs with additional fields.
-    """
-)
-class MyModelOutput(ImageClassifierOutput):
-    r"""
-    loss (`torch.FloatTensor`, *optional*):
-        The loss of the model.
-    custom_field (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
-        A custom output field specific to this model.
-    """
-
-    # Standard fields like hidden_states, logits, attentions etc. can be automatically documented if the description is the same as the standard arguments.
-    # However, given that the loss docstring is often different per model, you should document it in the docstring above.
-    loss: Optional[torch.FloatTensor] = None
-    logits: Optional[torch.FloatTensor] = None
-    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
-    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
-    # Custom fields need to be documented in the docstring above
-    custom_field: Optional[torch.FloatTensor] = None
-```
-
 </hfoption>
 <hfoption id="functions">

@ -198,7 +171,7 @@ class MyModel(PreTrainedModel):

 There are some rules for documenting different types of arguments and they're listed below.

- Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `auto_docstring.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape is the same as an argument in `auto_docstring.py`.
+- Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `args_doc.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape is the same as an argument in `args_doc.py`.

    If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding.

@ -272,7 +245,7 @@ When working with modular files (`modular_model.py`), follow the guidelines belo
 The `@auto_docstring` decorator automatically generates docstrings by:

 1. Inspecting the signature (arguments, types, defaults) of the decorated class' `__init__` method or the decorated function.
-2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`], [`ImageProcessorArgs`], and the `auto_docstring.py` file.
+2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`], [`ImageProcessorArgs`], and the `args_doc.py` file.
 3. Adding argument descriptions in one of two ways as shown below.

    | method | description | usage |
@ -280,7 +253,7 @@ The `@auto_docstring` decorator automatically generates docstrings by:
    | `r""" """` | add custom docstring content directly to a method signature or within the `__init__` docstring | document new arguments or override standard descriptions |
    | `custom_args` | add custom docstrings for specific arguments directly in `@auto_docstring` | define docstring for new arguments once if they're repeated in multiple places in the modeling file |

-4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or if it belongs to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `auto_docstring.py`.
+4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or if it belongs to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `args_doc.py`.

    `@auto_docstring` also accepts the `custom_intro` argument to describe a class or function.

--- a/docs/source/en/cache_explanation.md
+++ b/docs/source/en/cache_explanation.md
@ -82,18 +82,24 @@ When you use Transformers' [`Cache`] class, the self-attention module performs s

 ## Cache storage implementation

-Caches are structured as a list of layers, where each layer contains a key and value cache. The key and value caches are tensors with the shape `[batch_size, num_heads, seq_len, head_dim]`.
+The actual storage of key-value pairs varies between cache implementations. As an example, consider the [`DynamicCache`].

-Layers can be of different types (e.g. `DynamicLayer`, `StaticLayer`, `SlidingWindowLayer`), which mostly changes how sequence length is handled and how the cache is updated.

-The simplest is a `DynamicLayer` that grows as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token:
+In [`DynamicCache`], the key-value pairs are stored as two lists of tensors. Each tensor in the lists has the shape `[batch_size, num_heads, seq_len, head_dim]`.
+- `key_cache`: A list of tensors, one for each layer.
+- `value_cache`: A list of tensors, one for each layer.

+When new tokens are processed:
+
+1. For each layer, the new key and value states are concatenated with the existing cache.
 ```py
-cache.layers[idx].keys = torch.cat([cache.layers[idx].keys, key_states], dim=-2)
-cache.layers[idx].values = torch.cat([cache.layers[idx].values, value_states], dim=-2)
+self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
 ```

-Other layer types like `StaticLayer` and `SlidingWindowLayer` have a fixed sequence length that is set when the cache is created. This makes them compatible with `torch.compile`. In the case of `SlidingWindowLayer`, existing tokens are shifted out of the cache when a new token is added.
+2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.
+
+3. The cache maintains a count of seen tokens through `self._seen_tokens`. This is updated when the first layer processes a new token.

 The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.

@ -128,34 +134,6 @@ for _ in range(max_new_tokens):
 print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
 "[INST] Hello, what's your name. [/INST]  Hello! My name is LLaMA,"
 ```
-
-## Cache position
-
-The cache position tracks where to insert new tokens in the attention cache. It represents the *absolute* position of each token in the context, independent of padding or batch structure. Suppose you already cached `N` tokens and are now processing `K` new tokens. The cache position for the new tokens will range from `N` to `N + K - 1`. In other words, you're processing tokens at positions - `[N, N + 1, N + 2, ..., N + K - 1]`.
-
-Cache position is used internally for two purposes:
-
-1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven’t been cached yet are passed to the model's `forward`.
-2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, like [`StaticCache`], that pre-allocates a specific cache length.
-
-The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots.
-
-
-```py
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
-
-model_id = "meta-llama/Llama-2-7b-chat-hf"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-messages = [{"role": "user", "content": "You are a helpful assistant."}]
-inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
-generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=10)
-
-```
-
-
 ## Legacy cache format

 Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].
@ -165,7 +143,7 @@ The legacy format is essentially the same data structure but organized different
 - The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`.
 - The format is less flexible and doesn't support features like quantization or offloading.

-If your project depends on this legacy format, we recommend to convert to [`DynamicCache`] with [`~DynamicCache.from_legacy_cache`]. Note that legacy cache format is deprecated and not used anymore in `Transformers`. You can convert back to tuple format with [`DynamicCache.to_legacy_cache`] functions, which is helpful if you have custom logic for manipulating a cache in a specific format.
+If your project depends on this legacy format, you can convert between [`DynamicCache`] and a tuple of tuples as shown below with the [`~DynamicCache.from_legacy_cache`] and [`DynamicCache.to_legacy_cache`] functions. This is helpful if you have custom logic for manipulating a cache in a specific format.

 ```py
 import torch
@ -181,4 +159,4 @@ generation_outputs = model.generate(**inputs, return_dict_in_generate=True, retu

 cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
 legacy_format_cache = cache.to_legacy_cache()
-```
+```
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -27,7 +27,7 @@ This guide shows you how to quickly start chatting with Transformers from the co

 ## chat CLI

-After you've [installed Transformers](./installation), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
+After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.

 ```bash
 transformers chat Qwen/Qwen2.5-0.5B-Instruct
@ -158,4 +158,4 @@ The easiest solution for improving generation speed is to either quantize a mode
 You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed.

 > [!TIP]
-> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
+> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe.md), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
--- a/docs/source/en/custom_models.md
+++ b/docs/source/en/custom_models.md
@ -271,7 +271,7 @@ The model is ready to be pushed to the Hub now. Log in to your Hugging Face acco
 <hfoption id="huggingface-CLI">

 ```bash
-hf auth login
+huggingface-cli login
 ```

 </hfoption>
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@ -356,93 +356,66 @@ A [`Constraint`] can be used to force the generation to include specific tokens

 ## Caches

-[[autodoc]] CacheLayerMixin
-    - update
-    - get_seq_length
-    - get_mask_sizes
-    - get_max_cache_shape
-    - reset
-    - reorder_cache
-
-[[autodoc]] DynamicLayer
-    - update
-    - crop
-    - batch_repeat_interleave
-    - batch_select_indices
-
-[[autodoc]] StaticLayer
-    - update
-
-[[autodoc]] SlidingWindowLayer
-    - update
-
-[[autodoc]] CacheProcessor
-    - pre_update
-    - post_update
-
-[[autodoc]] OffloadedCacheProcessor
-    - pre_update
-
-[[autodoc]] QuantizedCacheProcessor
-    - post_update
-
-[[autodoc]] QuantoQuantizedCacheProcessor
-    - post_update
-
-[[autodoc]] HQQQuantizedCacheProcessor
-    - post_update
-
 [[autodoc]] Cache
    - update
-    - get_seq_length
-    - get_mask_sizes
-    - get_max_cache_shape
-    - reset
-    - reorder_cache
-    - crop
-    - batch_repeat_interleave
-    - batch_select_indices
+
+[[autodoc]] CacheConfig
+    - update
+
+[[autodoc]] QuantizedCacheConfig
+    - validate

 [[autodoc]] DynamicCache
+    - update
+    - get_seq_length
+    - reorder_cache
    - to_legacy_cache
    - from_legacy_cache

 [[autodoc]] QuantizedCache
+    - update
+    - get_seq_length

 [[autodoc]] QuantoQuantizedCache

-[[autodoc]] QuantoQuantizedCacheProcessor
-
 [[autodoc]] HQQQuantizedCache

-[[autodoc]] HQQQuantizedCacheProcessor
-
 [[autodoc]] OffloadedCache
+    - update
+    - prefetch_layer
+    - evict_previous_layer

 [[autodoc]] StaticCache
+    - update
+    - get_seq_length
+    - reset

 [[autodoc]] OffloadedStaticCache
+    - update
+    - get_seq_length
+    - reset

 [[autodoc]] HybridCache
-
-[[autodoc]] HybridChunkedCache
+    - update
+    - get_seq_length
+    - reset

 [[autodoc]] SlidingWindowCache
+    - update
+    - reset

 [[autodoc]] EncoderDecoderCache
+    - get_seq_length
    - to_legacy_cache
    - from_legacy_cache
+    - reset
+    - reorder_cache

 [[autodoc]] MambaCache
    - update_conv_state
    - update_ssm_state
    - reset

-[[autodoc]] CacheConfig
-
-[[autodoc]] QuantizedCacheConfig
-
-
 ## Watermark Utils

 [[autodoc]] WatermarkingConfig
--- a/docs/source/en/internal/model_debugging_utils.md
+++ b/docs/source/en/internal/model_debugging_utils.md
@ -247,114 +247,3 @@ first and last layer will be shown. This is useful when some layers (typically c
 layers.

 [[autodoc]] model_addition_debugger_context
-
-## Analyzer of skipped tests
-
-### Scan skipped tests - for model adders and maintainers
-
-This small util is a power user tool intended for model adders and maintainers. It lists all test methods
-existing in `test_modeling_common.py`, inherited by all model tester classes, and scans the repository to measure
-how many tests are being skipped and for which models. 
-
-### Rationale
-
-When porting models to transformers, tests fail as they should, and sometimes `test_modeling_common` feels irreconcilable with the peculiarities of our brand new model. But how can we be sure we're not breaking everything by adding a seemingly innocent skip?
-
-This utility:
- scans all test_modeling_common methods
- looks for times where a method is skipped
- returns a summary json you can load as a DataFrame/inspect
-
-**For instance test_inputs_embeds is skipped in a whooping 39% proportion at the time of writing this util.**
-
-![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/f7f671f69b88ce4967e19179172c248958d35742/transformers/tests_skipped_visualisation.png)
-
-
-### Usage 
-
-You can run the skipped test analyzer in two ways:
-
-#### Full scan (default)
-
-From the root of `transformers` repo, scans all common test methods and outputs the results to a JSON file (default: `all_tests_scan_result.json`).
-
-```bash
-python utils/scan_skipped_tests.py --output_dir path/to/output
-```
-
- `--output_dir` (optional): Directory where the JSON results will be saved. Defaults to the current directory.
-
-**Example output:**
-
-```
-🔬 Parsing 331 model test files once each...
-📝 Aggregating 224 tests...
-  (224/224) test_update_candidate_strategy_with_matches_1es_3d_is_nonecodet_schedule_fa_kwargs
-✅ Scan complete.
-
-📄 JSON saved to /home/pablo/git/transformers/all_tests_scan_result.json
-
-```
-
-And it will generate `all_tests_scan_result.json` file that you can inspect. The JSON is indexed by method name, and each entry follows this schema, indicating the origin as well (from `common`or `GenerationMixin`.)
-
-```json
-{
-  "<method_name>": {
-    "origin": "<test suite>"
-    "models_ran": ["<model_name>", ...],
-    "models_skipped": ["<model_name>", ...],
-    "skipped_proportion": <float>,
-    "reasons_skipped": ["<model_name>: <reason>",
-      ...
-    ]
-  },
-  ...
-}
-```
-
-Which you can visualise as above with e.g. `pandas`
-
-```python
-df = pd.read_json('all_tests_scan_result.json').T
-df.sort_values(by=['skipped_proportion'], ascending=False)
-
-```
-
-### Scan a single test method
-
-You can focus on a specific test method using `--test_method_name`:
-
-```bash
-$ python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds --output_dir path/to/output
-```
-
- `--test_method_name`: Name of the test method to scan (e.g., `test_inputs_embeds`).
- `--output_dir` (optional): Directory where the JSON result will be saved.
-
-**Example output:**
-
-```bash
-$ python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds
-
-🔬 Parsing 331 model test files once each...
-
-== test_inputs_embeds ==
-
-Ran    : 199/323
-Skipped : 124/323 (38.4%)
- - aimv2: Aimv2 does not use inputs_embeds
- - align: Inputs_embeds is tested in individual model tests
- - altclip: Inputs_embeds is tested in individual model tests
- - audio_spectrogram_transformer: AST does not use inputs_embeds
- - beit: BEiT does not use inputs_embeds
- - bit: Bit does not use inputs_embeds
- - blip: Blip does not use inputs_embeds
- - blip_2: Inputs_embeds is tested in individual model tests
- - bridgetower: 
- - canine: CANINE does not have a get_input_embeddings() method.
- - ...
-
-📄 JSON saved to /home/pablo/git/transformers/scan_test_inputs_embeds.json
-
-```
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@ -134,7 +134,7 @@ The [`QuantizedCache`] reduces memory requirements by quantizing the KV values t
 > [!WARNING]
 > Quantizing the cache can harm latency if the context length is short and there is enough GPU memory available for generation without enabling cache quantization. Try to find a balance between memory efficiency and latency.

-Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and the quantization backend, as well as any additional quantization related parameters should also be passed either as a dict. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.
+Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and indicate the quantization backend in [`QuantizedCacheConfig`]. Any additional quantization-related parameters should also be passed either as a dict or an instance of [`QuantizedCacheConfig`]. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.

 <hfoptions id="quantized-cache">
 <hfoption id="HQQQuantizedCache">
@ -143,7 +143,7 @@ For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value`

 ```py
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache
+from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
@ -161,7 +161,7 @@ For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-valu

 ```py
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache
+from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
@ -275,6 +275,7 @@ from transformers.cache_utils import (
    StaticCache,
    SlidingWindowCache,
    QuantoQuantizedCache,
+    QuantizedCacheConfig,
 )

 model_id = "meta-llama/Llama-2-7b-chat-hf"
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@ -341,7 +341,7 @@ A known issue with transformer models is that the self-attention mechanism grows

 FlashAttention and [FlashAttention-2](./perf_infer_gpu_one#flashattention-2) break up the attention computation into smaller chunks and reduces the number of intermediate read/write operations to the GPU memory to speed up inference. FlashAttention-2 improves on the original FlashAttention algorithm by also parallelizing over sequence length dimension and better partitioning work on the hardware to reduce synchronization and communication overhead.

-To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`] or set with `model.set_attention_implementation("flash_attention_2")` to dynamically update the [attention interface](./attention_interface) after the model is loaded.
+To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`].

 ```py
 from transformers import AutoModelForCausalLM, BitsAndBytesConfig
@ -353,14 +353,6 @@ model = AutoModelForCausalLM.from_pretrained(
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
 )
-
-# Change the model's attention dynamically after loading
-model = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2b",
-    quantization_config=quant_config,
-    torch_dtype=torch.bfloat16
-)
-model.set_attention_implementation("flash_attention_2")
 ```

 ### PyTorch scaled dot product attention
@ -368,7 +360,7 @@ model.set_attention_implementation("flash_attention_2")
 Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and it supports FlashAttention, xFormers, and PyTorch's C++ implementation. SDPA chooses the most performant attention algorithm if you're using a CUDA backend. For other backends, SDPA defaults to the PyTorch C++ implementation.

 > [!TIP]
-> SDPA automatically supports FlashAttention-2 as long as you have the latest PyTorch version installed.
+> SDPA automatically supports FlashAttention-2 as long as you have the latest PyTorch version installed.

 Use the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to explicitly enable or disable any of the four attention algorithms. For example, use `SDPBackend.FLASH_ATTENTION` to enable FlashAttention.

--- a/docs/source/en/llm_tutorial.md
+++ b/docs/source/en/llm_tutorial.md
@ -148,9 +148,9 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 | Option name | Type | Simplified description |
 |---|---|---|
 | `max_new_tokens` | `int` | Controls the maximum generation length. Be sure to define it, as it usually defaults to a small value. |
-| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies) for more information. |
+| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies.md) for more information. |
 | `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
-| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies) for more information. |
+| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
 | `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
 | `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |

--- a/docs/source/en/llm_tutorial_optimization.md
+++ b/docs/source/en/llm_tutorial_optimization.md
@ -23,11 +23,11 @@ The crux of these challenges lies in augmenting the computational and memory cap

 In this guide, we will go over the effective techniques for efficient LLM deployment:

-1.  **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization) can achieve computational advantages without a considerable decline in model performance.
+1.  **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization.md) can achieve computational advantages without a considerable decline in model performance.

 2.  **Flash Attention:** Flash Attention is a variation of the attention algorithm that not only provides a more memory-efficient approach but also realizes increased efficiency due to optimized GPU memory utilization.

-3.  **Architectural Innovations:** Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancement in model architectures hereby are [Alibi](https://huggingface.co/papers/2108.12409), [Rotary embeddings](https://huggingface.co/papers/2104.09864), [Multi-Query Attention (MQA)](https://huggingface.co/papers/1911.02150) and [Grouped-Query-Attention (GQA)](https://huggingface.co/papers/2305.13245).
+3.  **Architectural Innovations:** Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancement in model architectures hereby are [Alibi](https://huggingface.co/papers/2108.12409), [Rotary embeddings](https://huggingface.co/papers/2104.09864), [Multi-Query Attention (MQA)](https://huggingface.co/papers/1911.02150) and [Grouped-Query-Attention (GQA)](https://huggingface.co/papers/2305.13245).

 Throughout this guide, we will offer an analysis of auto-regressive generation from a tensor's perspective. We delve into the pros and cons of adopting lower precision, provide a comprehensive exploration of the latest attention algorithms, and discuss improved LLM architectures. While doing so, we run practical examples showcasing each of the feature improvements.

--- a/docs/source/en/main_classes/callback.md
+++ b/docs/source/en/main_classes/callback.md
@ -33,7 +33,6 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
  it's the second one).
 - [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
  or tensorboardX).
- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
 - [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
 - [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
 - [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
@ -73,9 +72,6 @@ Here is the list of the available [`TrainerCallback`] in the library:

 [[autodoc]] integrations.TensorBoardCallback

-[[autodoc]] integrations.TrackioCallback
-    - setup
-
 [[autodoc]] integrations.WandbCallback
    - setup

--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@ -93,10 +93,6 @@ Learn how to quantize models in the [Quantization](../quantization) guide.

 [[autodoc]] QuarkConfig

-## FPQuantConfig
-
-[[autodoc]] FPQuantConfig
-
 ## AutoRoundConfig

 [[autodoc]] AutoRoundConfig
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@ -258,10 +258,6 @@ The following auto classes are available for the following computer vision tasks

 [[autodoc]] AutoModelForKeypointDetection

-### AutoModelForKeypointMatching
-
-[[autodoc]] AutoModelForKeypointMatching
-
 ### AutoModelForMaskedImageModeling

 [[autodoc]] AutoModelForMaskedImageModeling
--- a/docs/source/en/model_doc/barthez.md
+++ b/docs/source/en/model_doc/barthez.md
@ -14,81 +14,49 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZC
D9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-        ">
-    </div>
-</div>
-
 # BARThez

-[BARThez](https://huggingface.co/papers/2010.12321) is a [BART](./bart) model designed for French language tasks. Unlike existing French BERT models, BARThez includes a pretrained encoder-decoder, allowing it to generate text as well. This model is also available as a multilingual variant, mBARThez, by continuing pretraining multilingual BART on a French corpus.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+">
+</div>

-You can find all of the original BARThez checkpoints under the [BARThez](https://huggingface.co/collections/dascim/barthez-670920b569a07aa53e3b6887) collection.
+## Overview

-> [!TIP]
-> This model was contributed by [moussakam](https://huggingface.co/moussakam).
-> Refer to the [BART](./bart) docs for more usage examples.
+The BARThez model was proposed in [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://huggingface.co/papers/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis on 23 Oct,
+2020.
+
+The abstract of the paper:


-The example below demonstrates how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.
+*Inductive transfer learning, enabled by self-supervised learning, have taken the entire Natural Language Processing
+(NLP) field by storm, with models such as BERT and BART setting new state of the art on countless natural language
+understanding tasks. While there are some notable exceptions, most of the available models and research have been
+conducted for the English language. In this work, we introduce BARThez, the first BART model for the French language
+(to the best of our knowledge). BARThez was pretrained on a very large monolingual French corpus from past research
+that we adapted to suit BART's perturbation schemes. Unlike already existing BERT-based French language models such as
+CamemBERT and FlauBERT, BARThez is particularly well-suited for generative tasks, since not only its encoder but also
+its decoder is pretrained. In addition to discriminative tasks from the FLUE benchmark, we evaluate BARThez on a novel
+summarization dataset, OrangeSum, that we release with this paper. We also continue the pretraining of an already
+pretrained multilingual BART on BARThez's corpus, and we show that the resulting model, which we call mBARTHez,
+provides a significant boost over vanilla BARThez, and is on par with or outperforms CamemBERT and FlauBERT.*

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+This model was contributed by [moussakam](https://huggingface.co/moussakam). The Authors' code can be found [here](https://github.com/moussaKam/BARThez).

-```py
-import torch
-from transformers import pipeline
+<Tip> 

-pipeline = pipeline(
-    task="fill-mask",
-    model="moussaKam/barthez",
-    torch_dtype=torch.float16,
-    device=0
-)
-pipeline("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.")
-```
+The BARThez implementation is the same as BART's, except for tokenization. Refer to the [BART documentation](bart) for information on
+configuration classes and their parameters. BARThez-specific tokenizers are documented below.

-</hfoption>
-<hfoption id="AutoModel">
+</Tip>

-```py
-import torch
-from transformers import AutoModelForMaskedLM, AutoTokenizer
+## Resources

-tokenizer = AutoTokenizer.from_pretrained(
-    "moussaKam/barthez",
-)
-model = AutoModelForMaskedLM.from_pretrained(
-    "moussaKam/barthez",
-    torch_dtype=torch.float16,
-    device_map="auto",
-)
-inputs = tokenizer("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.", return_tensors="pt").to("cuda")
+- BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check:
+  [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md).

-with torch.no_grad():
-    outputs = model(**inputs)
-    predictions = outputs.logits
-
-masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
-predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
-predicted_token = tokenizer.decode(predicted_token_id)
-
-print(f"The predicted token is: {predicted_token}")
-```
-
-</hfoption>
-<hfoption id="transformers CLI">
-
-```bash
-echo -e "Les plantes produisent <mask> grâce à un processus appelé photosynthèse." | transformers run --task fill-mask --model moussaKam/barthez --device 0
-```
-
-</hfoption>
-</hfoptions>

 ## BarthezTokenizer

--- a/docs/source/en/model_doc/camembert.md
+++ b/docs/source/en/model_doc/camembert.md
@ -14,105 +14,49 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-	<div class="flex flex-wrap space-x-1">
-		<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-		<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-    <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-	</div>
-</div>
-
 # CamemBERT

-[CamemBERT](https://huggingface.co/papers/1911.03894) is a language model based on [RoBERTa](./roberta), but trained specifically on French text from the OSCAR dataset, making it more effective for French language tasks.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-What sets CamemBERT apart is that it learned from a huge, high quality collection of French data, as opposed to mixing lots of languages. This helps it really understand French better than many multilingual models.
+## Overview

-Common applications of CamemBERT include masked language modeling (Fill-mask prediction), text classification (sentiment analysis), token classification (entity recognition) and sentence pair classification (entailment tasks).
+The CamemBERT model was proposed in [CamemBERT: a Tasty French Language Model](https://huggingface.co/papers/1911.03894) by
+[Louis Martin](https://huggingface.co/louismartin), [Benjamin Muller](https://huggingface.co/benjamin-mlr), [Pedro Javier Ortiz Suárez](https://huggingface.co/pjox), Yoann Dupont, Laurent Romary, Éric Villemonte de la
+Clergerie, [Djamé Seddah](https://huggingface.co/Djame), and [Benoît Sagot](https://huggingface.co/sagot). It is based on Facebook's RoBERTa model released in 2019. It is a model
+trained on 138GB of French text.

-You can find all the original CamemBERT checkpoints under the [ALMAnaCH](https://huggingface.co/almanach/models?search=camembert) organization.
+The abstract from the paper is the following:

-> [!TIP]
-> This model was contributed by the [ALMAnaCH (Inria)](https://huggingface.co/almanach) team.
->
-> Click on the CamemBERT models in the right sidebar for more examples of how to apply CamemBERT to different NLP tasks.
+*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success, most available
+models have either been trained on English data or on the concatenation of data in multiple languages. This makes
+practical use of such models --in all languages except English-- very limited. Aiming to address this issue for French,
+we release CamemBERT, a French version of the Bi-directional Encoders for Transformers (BERT). We measure the
+performance of CamemBERT compared to multilingual models in multiple downstream tasks, namely part-of-speech tagging,
+dependency parsing, named-entity recognition, and natural language inference. CamemBERT improves the state of the art
+for most of the tasks considered. We release the pretrained model for CamemBERT hoping to foster research and
+downstream applications for French NLP.*

-The examples below demonstrate how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.
+This model was contributed by [the ALMAnaCH team (Inria)](https://huggingface.co/almanach). The original code can be found [here](https://camembert-model.fr/).

-<hfoptions id="usage">
+<Tip>

-<hfoption id="Pipeline">
+This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples as well
+as information about the inputs and outputs.

-```python
-import torch
-from transformers import pipeline
+</Tip>

-pipeline = pipeline("fill-mask", model="camembert-base", torch_dtype=torch.float16, device=0)
-pipeline("Le camembert est un délicieux fromage <mask>.")
-```
-</hfoption> 
+## Resources

-<hfoption id="AutoModel">
-
-```python
-import torch
-from transformers import AutoTokenizer, AutoModelForMaskedLM
-
-tokenizer = AutoTokenizer.from_pretrained("camembert-base")
-model = AutoModelForMaskedLM.from_pretrained("camembert-base", torch_dtype="auto", device_map="auto", attn_implementation="sdpa")
-inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    outputs = model(**inputs)
-    predictions = outputs.logits
-
-masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
-predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
-predicted_token = tokenizer.decode(predicted_token_id)
-
-print(f"The predicted token is: {predicted_token}")
-```
-</hfoption> 
-
-<hfoption id="transformers CLI">
-
-```bash
-echo -e "Le camembert est un délicieux fromage <mask>." | transformers run --task fill-mask --model camembert-base --device 0
-```
-
-</hfoption> 
-
-</hfoptions> 
-
-
-Quantization reduces the memory burden of large models by representing weights in lower precision. Refer to the [Quantization](../quantization/overview) overview for available options.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes) quantization to quantize the weights to 8-bits.
-  
-```python
-from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig
-import torch
-
-quant_config = BitsAndBytesConfig(load_in_8bit=True)
-model = AutoModelForMaskedLM.from_pretrained(
-    "almanach/camembert-large",
-    quantization_config=quant_config,
-    device_map="auto"
-)
-tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-large")
-
-inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    outputs = model(**inputs)
-    predictions = outputs.logits
-
-masked_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
-predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
-predicted_token = tokenizer.decode(predicted_token_id)
-
-print(f"The predicted token is: {predicted_token}")
-```
+- [Text classification task guide](../tasks/sequence_classification)
+- [Token classification task guide](../tasks/token_classification)
+- [Question answering task guide](../tasks/question_answering)
+- [Causal language modeling task guide](../tasks/language_modeling)
+- [Masked language modeling task guide](../tasks/masked_language_modeling)
+- [Multiple choice task guide](../tasks/multiple_choice)

 ## CamembertConfig

@ -193,4 +137,5 @@ print(f"The predicted token is: {predicted_token}")
 [[autodoc]] TFCamembertForQuestionAnswering

 </tf>
-</frameworkcontent>
+</frameworkcontent>
+
--- a/docs/source/en/model_doc/clap.md
+++ b/docs/source/en/model_doc/clap.md
@ -14,50 +14,25 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-  <div class="flex flex-wrap space-x-1">
-    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-  </div>
-</div>
-
 # CLAP

-[CLAP (Contrastive Language-Audio Pretraining)](https://huggingface.co/papers/2211.06687) is a multimodal model that combines audio data with natural language descriptions through contrastive learning.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-It incorporates feature fusion and keyword-to-caption augmentation to process variable-length audio inputs and to improve performance. CLAP doesn't require task-specific training data and can learn meaningful audio representations through natural language.
+## Overview

-You can find all the original CLAP checkpoints under the [CLAP](https://huggingface.co/collections/laion/clap-contrastive-language-audio-pretraining-65415c0b18373b607262a490) collection.
+The CLAP model was proposed in [Large Scale Contrastive Language-Audio pretraining with
+feature fusion and keyword-to-caption augmentation](https://huggingface.co/papers/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.

-> [!TIP]
-> This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
->
-> Click on the CLAP models in the right sidebar for more examples of how to apply CLAP to different audio retrieval and classification tasks.
+CLAP (Contrastive Language-Audio Pretraining) is a neural network trained on a variety of (audio, text) pairs. It can be instructed to predict the most relevant text snippet, given an audio sample, without directly optimizing for the task. The CLAP model uses a SWINTransformer to get audio features from a log-Mel spectrogram input, and a RoBERTa model to get text features. Both the text and audio features are then projected to a latent space with identical dimension. The dot product between the projected audio and text features is then used as a similarity score.

-The example below demonstrates how to extract text embeddings with the [`AutoModel`] class.
+The abstract from the paper is the following:

-<hfoptions id="usage">
-<hfoption id="AutoModel">
+*Contrastive learning has shown remarkable success in the field of multimodal representation learning. In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance. Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. In audio classification tasks, the model achieves state-of-the-art performance in the zeroshot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. LAION-Audio-630K and the proposed model are both available to the public.*

-```python
-import torch
-from transformers import AutoTokenizer, AutoModel
-
-model = AutoModel.from_pretrained("laion/clap-htsat-unfused", torch_dtype=torch.float16, device_map="auto")
-tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
-
-texts = ["the sound of a cat", "the sound of a dog", "music playing"]
-
-inputs = tokenizer(texts, padding=True, return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    text_features = model.get_text_features(**inputs)
-
-print(f"Text embeddings shape: {text_features.shape}")
-print(f"Text embeddings: {text_features}")
-```
-
-</hfoption>
-</hfoptions>
+This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArthurZ).
+The original code can be found [here](https://github.com/LAION-AI/Clap).

 ## ClapConfig

--- a/docs/source/en/model_doc/cohere2.md
+++ b/docs/source/en/model_doc/cohere2.md
@ -1,115 +1,43 @@
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
-    </div>
+# Cohere2
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

+## Overview
+[C4AI Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7 billion parameter model developed by Cohere and Cohere For AI. It has advanced capabilities optimized for various use cases, including reasoning, summarization, question answering, and code. The model is trained to perform sophisticated tasks including Retrieval Augmented Generation (RAG) and tool use. The model also has powerful agentic capabilities that can use and combine multiple tools over multiple steps to accomplish more difficult tasks. It obtains top performance on enterprise-relevant code use cases. C4AI Command R7B is a multilingual model trained on 23 languages.

-# Cohere2
+The model features three layers with sliding window attention (window size 4096) and RoPE for efficient local context modeling and relative positional encoding. A fourth layer uses global attention without positional embeddings, enabling unrestricted token interactions across the entire sequence.

-[Cohere Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7B billion parameter model. It is a multilingual model trained on 23 languages and has a context window of 128k. The model features three layers with sliding window attention and ROPE for efficient local context modeling and relative positional encoding. A fourth layer uses global attention without positional embeddings, enabling unrestricted token interactions across the entire sequence.
+The model has been trained on 23 languages: English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Arabic, Chinese, Russian, Polish, Turkish, Vietnamese, Dutch, Czech, Indonesian, Ukrainian, Romanian, Greek, Hindi, Hebrew, and Persian.

-This model is optimized for speed, cost-performance, and compute resources.
-
-You can find all the original Command-R checkpoints under the [Command Models](https://huggingface.co/collections/CohereForAI/command-models-67652b401665205e17b192ad) collection.
-
-
-> [!TIP]
-> Click on the Cohere models in the right sidebar for more examples of how to apply Cohere to different language tasks.
-
-The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`] class, and from the command line.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
+## Usage tips
+The model and tokenizer can be loaded via:

 ```python
-import torch
-from transformers import pipeline
-
-pipeline = pipeline(
-    task="text-generation", 
-    model="CohereLabs/c4ai-command-r7b-12-2024",
-    torch_dtype=torch.float16,
-    device_map=0
-)
-
-messages = [
-    {"role": "user", "content": "Hello, can you please help me book a hotel in Japan?"},
-]
-pipeline(messages)
-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```python
-import torch
+# pip install transformers
 from transformers import AutoTokenizer, AutoModelForCausalLM

-tokenizer = AutoTokenizer.from_pretrained("CohereLabs/c4ai-command-r7b-12-2024")
-model = AutoModelForCausalLM.from_pretrained(
-    "CohereLabs/c4ai-command-r7b-12-2024", 
-    torch_dtype=torch.float16, 
-    device_map="auto", 
-    attn_implementation="sdpa"
-)
+model_id = "CohereForAI/c4ai-command-r7b-12-2024"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id)

-# format message with the Command-R chat template
-messages = [{"role": "user", "content": "Hello, can you please help me book a hotel in Japan?"}]
-input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
-output = model.generate(
+# Format message with the command-r chat template
+messages = [{"role": "user", "content": "Hello, how are you?"}]
+input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+
+gen_tokens = model.generate(
    input_ids,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.3,
-    cache_implementation="static",
-)
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-</hfoption>
-<hfoption id="transformers CLI">
-
-```bash
-# pip install -U flash-attn --no-build-isolation
-transformers-cli chat CohereLabs/c4ai-command-r7b-12-2024 --torch_dtype auto --attn_implementation flash_attention_2
-```
-
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview.md) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to 4-bits.
-
-```python
-import torch
-from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
-
-bnb_config = BitsAndBytesConfig(load_in_4bit=True)
-tokenizer = AutoTokenizer.from_pretrained("CohereLabs/c4ai-command-r7b-12-2024")
-model = AutoModelForCausalLM.from_pretrained(
-    "CohereLabs/c4ai-command-r7b-12-2024", 
-    torch_dtype=torch.float16, 
-    device_map="auto", 
-    quantization_config=bnb_config, 
-    attn_implementation="sdpa"
 )

-# format message with the Command-R chat template
-messages = [{"role": "user", "content": "Hello, can you please help me book a hotel in Japan?"}]
-input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
-output = model.generate(
-    input_ids,
-    max_new_tokens=100,
-    do_sample=True,
-    temperature=0.3,
-    cache_implementation="static",
-)
-print(tokenizer.decode(output[0], skip_special_tokens=True))
+gen_text = tokenizer.decode(gen_tokens[0])
+print(gen_text)
 ```

 ## Cohere2Config
--- a/docs/source/en/model_doc/cohere2_vision.md
+++ b/docs/source/en/model_doc/cohere2_vision.md
@ -1,123 +0,0 @@
-# Command A Vision
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
-</div>
-
-## Overview
-
-Command A Vision is a state-of-the-art multimodal model designed to seamlessly integrate visual and textual information for a wide range of applications. By combining advanced computer vision techniques with natural language processing capabilities, Command A Vision enables users to analyze, understand, and generate insights from both visual and textual data.
-
-The model excels at tasks including image captioning, visual question answering, document understanding, and chart understanding. This makes it a versatile tool for AI practitioners. Its ability to process complex visual and textual inputs makes it useful in settings where text-only representations are imprecise or unavailable, like real-world image understanding and graphics-heavy document processing.
-
-Command A Vision is built upon a robust architecture that leverages the latest advancements in VLMs. It's highly performant and efficient, even when dealing with large-scale datasets. The model's flexibility makes it suitable for a wide range of use cases, from content moderation and image search to medical imaging analysis and robotics.
-
-## Usage tips
-
-The model and image processor can be loaded as follows:
-
-<hfoptions id="usage">
-<hfoption id="AutoModel">
-
-```python
-import torch
-
-from transformers import AutoProcessor, AutoModelForImageTextToText
-
-model_id = "CohereLabs/command-a-vision-07-2025"
-
-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForImageTextToText.from_pretrained(
-    model_id, device_map="auto", torch_dtype=torch.float16
-)
-
-# Format message with the Command-A-Vision chat template
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "image",
-                "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg",
-            },
-            {"type": "text", "text": "what is in this image?"},
-        ],
-    },
-]
-
-inputs = processor.apply_chat_template(
-    messages,
-    padding=True,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    return_tensors="pt",
-).to(model.device)
-
-gen_tokens = model.generate(
-    **inputs,
-    max_new_tokens=300,
-    do_sample=True,
-    temperature=0.3,
-)
-
-print(
-    processor.tokenizer.decode(
-        gen_tokens[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
-    )
-)
-```
-
-</hfoption>
-<hfoption id="Pipeline">
-
-```python
-from transformers import pipeline
-
-pipe = pipeline(model="CohereLabs/command-a-vision-07-2025", task="image-text-to-text", device_map="auto")
-
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "image",
-                "url": "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=",
-            },
-            {"type": "text", "text": "Where was this taken ?"},
-        ],
-    },
-]
-
-outputs = pipe(text=messages, max_new_tokens=300, return_full_text=False)
-
-print(outputs)
-```
-</hfoption>
-</hfoptions>
-
-## Cohere2VisionConfig
-
-[[autodoc]] Cohere2VisionConfig
-
-## Cohere2VisionForConditionalGeneration
-
-[[autodoc]] Cohere2VisionForConditionalGeneration
-    - forward
-
-## Cohere2VisionModel
-
-[[autodoc]] Cohere2VisionModel
-    - forward
-
-## Cohere2VisionImageProcessorFast
-
-[[autodoc]] Cohere2VisionImageProcessorFast
-    - preprocess
-
-## Cohere2VisionProcessor
-
-[[autodoc]] Cohere2VisionProcessor
--- a/docs/source/en/model_doc/colpali.md
+++ b/docs/source/en/model_doc/colpali.md
@ -95,7 +95,7 @@ images = [

 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

-The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to int4.
+The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to int4.

 ```python
 import requests
--- a/docs/source/en/model_doc/colqwen2.md
+++ b/docs/source/en/model_doc/colqwen2.md
@ -99,7 +99,7 @@ images = [

 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

-The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to int4.
+The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to int4.

 ```python
 import requests
--- a/docs/source/en/model_doc/csm.md
+++ b/docs/source/en/model_doc/csm.md
@ -21,7 +21,7 @@ rendered properly in your Markdown viewer.
 The Conversational Speech Model (CSM) is the first open-source contextual text-to-speech model [released by Sesame](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice). It is designed to generate natural-sounding speech with or without conversational context. This context typically consists of multi-turn dialogue between speakers, represented as sequences of text and corresponding spoken audio.

 **Model Architecture:**
-CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.
+CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi.md), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.

 The original csm-1b checkpoint is available under the [Sesame](https://huggingface.co/sesame/csm-1b) organization on Hugging Face.

--- a/docs/source/en/model_doc/deepseek_vl.md
+++ b/docs/source/en/model_doc/deepseek_vl.md
@ -1,224 +0,0 @@
-<!--Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
-# DeepseekVL
-
-[Deepseek-VL](https://arxiv.org/abs/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images for generating contextually relevant responses. The model leverages [LLaMA](./llama) as its text encoder, while [SigLip](./siglip) is used for encoding images.
-
-You can find all the original Deepseek-VL checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization.
-
-> [!TIP]
-> Click on the Deepseek-VL models in the right sidebar for more examples of how to apply Deepseek-VL to different vision and language tasks.
-
-The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
-
-```py
-import torch
-from transformers import pipeline
-
-pipe = pipeline(
-    task="image-text-to-text",
-    model="deepseek-community/deepseek-vl-1.3b-chat",
-    device=0,
-    torch_dtype=torch.float16
-)
-
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "image",
-                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
-            },
-            { "type": "text", "text": "Describe this image."},
-        ]
-    }
-]
-
-pipe(text=messages, max_new_tokens=20, return_full_text=False)
-```
-</hfoption>
-
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import DeepseekVLForConditionalGeneration, AutoProcessor
-
-model = DeepseekVLForConditionalGeneration.from_pretrained(
-    "deepseek-community/deepseek-vl-1.3b-chat",
-    torch_dtype=torch.float16,
-    device_map="auto",
-    attn_implementation="sdpa"
-)
-
-processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-1.3b-chat")
-
-messages = [
-    {
-        "role":"user",
-        "content":[
-            {
-                "type":"image",
-                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
-            },
-            {
-                "type":"text",
-                "text":"Describe this image."
-            }
-        ]
-    }
-
-]
-
-inputs = processor.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    return_tensors="pt"
-).to(model.device, dtype=model.dtype)
-
-generated_ids = model.generate(**inputs, max_new_tokens=128)
-generated_ids_trimmed = [
-    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-]
-output_text = processor.batch_decode(
-    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-
-print(output_text)
-```
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
-
-```python
-import torch
-from transformers import TorchAoConfig, DeepseekVLForConditionalGeneration, AutoProcessor
-
-quantization_config = TorchAoConfig(
-    "int4_weight_only",
-    group_size=128
-)
-
-model = DeepseekVLForConditionalGeneration.from_pretrained(
-    "deepseek-community/deepseek-vl-1.3b-chat",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-```
-### Notes
-
- Do inference with multiple images in a single conversation.
-    ```py
-    import torch
-    from transformers import DeepseekVLForConditionalGeneration, AutoProcessor
-
-    model = DeepseekVLForConditionalGeneration.from_pretrained(
-        "deepseek-community/deepseek-vl-1.3b-chat",
-        torch_dtype=torch.float16,
-        device_map="auto",
-        attn_implementation="sdpa"
-    )
-
-    processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-1.3b-chat")
-
-    messages = [
-        [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What’s the difference between"},
-                    {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
-                    {"type": "text", "text": " and "},
-                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
-                ]
-            }
-        ],
-        [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"},
-                    {"type": "text", "text": "What do you see in this image?"}
-                ]
-            }
-        ]
-    ]
-
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        padding=True,
-        truncation=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt"
-    ).to(model.device, dtype=model.dtype)
-
-    generated_ids = model.generate(**inputs, max_new_tokens=128)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-
-    print(output_text)
-    ```
-
-## DeepseekVLConfig
-
-[[autodoc]] DeepseekVLConfig
-
-## DeepseekVLProcessor
-
-[[autodoc]] DeepseekVLProcessor
-
-## DeepseekVLImageProcessor
-
-[[autodoc]] DeepseekVLImageProcessor
-
-## DeepseekVLImageProcessorFast
-
-[[autodoc]] DeepseekVLImageProcessorFast
-
-## DeepseekVLModel
-
-[[autodoc]] DeepseekVLModel
-    - forward
-
-## DeepseekVLForConditionalGeneration
-
-[[autodoc]] DeepseekVLForConditionalGeneration
-    - forward
--- a/docs/source/en/model_doc/deepseek_vl_hybrid.md
+++ b/docs/source/en/model_doc/deepseek_vl_hybrid.md
@ -1,223 +0,0 @@
-<!--Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
-# DeepseekVLHybrid
-
-[Deepseek-VL-Hybrid](https://arxiv.org/abs/2403.05525) was introduced by the DeepSeek AI team. It is a vision-language model (VLM) designed to process both text and images for generating contextually relevant responses. The model leverages [LLaMA](./llama) as its text encoder, while [SigLip](./siglip) is used for encoding low-resolution images and [SAM (Segment Anything Model)](./sam) is incorporated to handle high-resolution image encoding, enhancing the model’s ability to process fine-grained visual details. Deepseek-VL-Hybrid is a variant of Deepseek-VL that uses [SAM (Segment Anything Model)](./sam) to handle high-resolution image encoding.
-
-You can find all the original Deepseek-VL-Hybrid checkpoints under the [DeepSeek-community](https://huggingface.co/deepseek-community) organization.
-
-> [!TIP]
-> Click on the Deepseek-VL-Hybrid models in the right sidebar for more examples of how to apply Deepseek-VL-Hybrid to different vision and language tasks.
-
-The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
-
-```py
-import torch
-from transformers import pipeline
-
-pipe = pipeline(
-    task="image-text-to-text",
-    model="deepseek-community/deepseek-vl-7b-chat",
-    device=0,
-    torch_dtype=torch.float16
-)
-
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "image",
-                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
-            },
-            { "type": "text", "text": "Describe this image."},
-        ]
-    }
-]
-
-pipe(text=messages, max_new_tokens=20, return_full_text=False)
-```
-</hfoption>
-
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor
-
-model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
-    "deepseek-community/deepseek-vl-7b-chat",
-    torch_dtype=torch.float16,
-    device_map="auto",
-    attn_implementation="sdpa"
-)
-
-processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
-
-messages = [
-    {
-        "role":"user",
-        "content":[
-            {
-                "type":"image",
-                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
-            },
-            {
-                "type":"text",
-                "text":"Describe this image."
-            }
-        ]
-    }
-
-]
-
-inputs = processor.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    return_tensors="pt"
-).to(model.device, dtype=model.dtype)
-
-generated_ids = model.generate(**inputs, max_new_tokens=128)
-generated_ids_trimmed = [
-    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-]
-output_text = processor.batch_decode(
-    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-
-print(output_text)
-```
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
-
-```python
-import torch
-from transformers import TorchAoConfig, DeepseekVLHybridForConditionalGeneration, AutoProcessor
-
-quantization_config = TorchAoConfig(
-    "int4_weight_only",
-    group_size=128
-)
-
-model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
-    "deepseek-community/deepseek-vl-7b-chat",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-```
-### Notes
-
- Do inference with multiple images in a single conversation.
-    ```py
-    import torch
-    from transformers import DeepseekVLHybridForConditionalGeneration, AutoProcessor
-
-    model = DeepseekVLHybridForConditionalGeneration.from_pretrained(
-        "deepseek-community/deepseek-vl-7b-chat",
-        torch_dtype=torch.float16,
-        device_map="auto",
-        attn_implementation="sdpa"
-    )
-
-    processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
-
-    messages = [
-        [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What’s the difference between"},
-                    {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
-                    {"type": "text", "text": " and "},
-                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}
-                ]
-            }
-        ],
-        [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"},
-                    {"type": "text", "text": "What do you see in this image?"}
-                ]
-            }
-        ]
-    ]
-
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        padding=True,
-        truncation=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt"
-    ).to(model.device, dtype=model.dtype)
-
-    generated_ids = model.generate(**inputs, max_new_tokens=128)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-
-    print(output_text)
-    ```
-
-## DeepseekVLHybridConfig
-
-[[autodoc]] DeepseekVLHybridConfig
-
-## DeepseekVLHybridProcessor
-
-[[autodoc]] DeepseekVLHybridProcessor
-
-## DeepseekVLHybridImageProcessor
-
-[[autodoc]] DeepseekVLHybridImageProcessor
-
-## DeepseekVLHybridImageProcessorFast
-
-[[autodoc]] DeepseekVLHybridImageProcessorFast
-
-## DeepseekVLHybridModel
-
-[[autodoc]] DeepseekVLHybridModel
-    - forward
-
-## DeepseekVLHybridForConditionalGeneration
-
-[[autodoc]] DeepseekVLHybridForConditionalGeneration
-    - forward
--- a/docs/source/en/model_doc/dia.md
+++ b/docs/source/en/model_doc/dia.md
@ -26,14 +26,14 @@ rendered properly in your Markdown viewer.

 ## Overview

-Dia is an open-source text-to-speech (TTS) model (1.6B parameters) developed by [Nari Labs](https://huggingface.co/nari-labs).
-It can generate highly realistic dialogue from transcript including non-verbal communications such as laughter and coughing.
+Dia is an open-source text-to-speech (TTS) model (1.6B parameters) developed by [Nari Labs](https://huggingface.co/nari-labs).
+It can generate highly realistic dialogue from a transcript, including nonverbal communications such as laughter and coughing.
 Furthermore, emotion and tone control is also possible via audio conditioning (voice cloning).

 **Model Architecture:**
 Dia is an encoder-decoder transformer based on the original transformer architecture. However, some more modern features such as
 rotational positional embeddings (RoPE) are also included. For its text portion (encoder), a byte tokenizer is utilized while
-for the audio portion (decoder), a pretrained codec model [DAC](./dac) is used - DAC encodes speech into discrete codebook
+for the audio portion (decoder), a pretrained codec model [DAC](./dac.md) is used - DAC encodes speech into discrete codebook
 tokens and decodes them back into audio.

 ## Usage Tips
--- a/docs/source/en/model_doc/efficientloftr.md
+++ b/docs/source/en/model_doc/efficientloftr.md
@ -1,149 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the MIT License; you may not use this file except in compliance with
-the License.
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
-    </div>
-</div>
-
-# EfficientLoFTR
-
-[EfficientLoFTR](https://huggingface.co/papers/2403.04765) is an efficient detector-free local feature matching method that produces semi-dense matches across images with sparse-like speed. It builds upon the original [LoFTR](https://huggingface.co/papers/2104.00680) architecture but introduces significant improvements for both efficiency and accuracy. The key innovation is an aggregated attention mechanism with adaptive token selection that makes the model ~2.5× faster than LoFTR while achieving higher accuracy. EfficientLoFTR can even surpass state-of-the-art efficient sparse matching pipelines like [SuperPoint](./superpoint) + [LightGlue](./lightglue) in terms of speed, making it suitable for large-scale or latency-sensitive applications such as image retrieval and 3D reconstruction.
-
-> [!TIP]
-> This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
->
-> Click on the EfficientLoFTR models in the right sidebar for more examples of how to apply EfficientLoFTR to different computer vision tasks.
-
-The example below demonstrates how to match keypoints between two images with the [`AutoModel`] class.
-
-<hfoptions id="usage">
-<hfoption id="AutoModel">
-
-```py
-from transformers import AutoImageProcessor, AutoModelForKeypointMatching
-import torch
-from PIL import Image
-import requests
-
-url_image1 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_98169888_3347710852.jpg"
-image1 = Image.open(requests.get(url_image1, stream=True).raw)
-url_image2 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_26757027_6717084061.jpg"
-image2 = Image.open(requests.get(url_image2, stream=True).raw)
-
-images = [image1, image2]
-
-processor = AutoImageProcessor.from_pretrained("zju-community/efficientloftr")
-model = AutoModelForKeypointMatching.from_pretrained("zju-community/efficientloftr")
-
-inputs = processor(images, return_tensors="pt")
-with torch.no_grad():
-    outputs = model(**inputs)
-
-# Post-process to get keypoints and matches
-image_sizes = [[(image.height, image.width) for image in images]]
-processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
-```
-
-</hfoption>
-</hfoptions>
-
-## Notes
-
- EfficientLoFTR is designed for efficiency while maintaining high accuracy. It uses an aggregated attention mechanism with adaptive token selection to reduce computational overhead compared to the original LoFTR.
-
-    ```py
-    from transformers import AutoImageProcessor, AutoModelForKeypointMatching
-    import torch
-    from PIL import Image
-    import requests
-    
-    processor = AutoImageProcessor.from_pretrained("zju-community/efficientloftr")
-    model = AutoModelForKeypointMatching.from_pretrained("zju-community/efficientloftr")
-    
-    # EfficientLoFTR requires pairs of images
-    images = [image1, image2]
-    inputs = processor(images, return_tensors="pt")
-    outputs = model(**inputs)
-    
-    # Extract matching information
-    keypoints = outputs.keypoints        # Keypoints in both images
-    matches = outputs.matches            # Matching indices 
-    matching_scores = outputs.matching_scores  # Confidence scores
-    ```
-
- The model produces semi-dense matches, offering a good balance between the density of matches and computational efficiency. It excels in handling large viewpoint changes and texture-poor scenarios.
-
- For better visualization and analysis, use the [`~EfficientLoFTRImageProcessor.post_process_keypoint_matching`] method to get matches in a more readable format.
-
-    ```py
-    # Process outputs for visualization
-    image_sizes = [[(image.height, image.width) for image in images]]
-    processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
-    
-    for i, output in enumerate(processed_outputs):
-        print(f"For the image pair {i}")
-        for keypoint0, keypoint1, matching_score in zip(
-                output["keypoints0"], output["keypoints1"], output["matching_scores"]
-        ):
-            print(f"Keypoint at {keypoint0.numpy()} matches with keypoint at {keypoint1.numpy()} with score {matching_score}")
-    ```
-
- Visualize the matches between the images using the built-in plotting functionality.
-
-    ```py
-    # Easy visualization using the built-in plotting method
-    visualized_images = processor.visualize_keypoint_matching(images, processed_outputs)
-    ```
-
- EfficientLoFTR uses a novel two-stage correlation layer that achieves accurate subpixel correspondences, improving upon the original LoFTR's fine correlation module.
-
-<div class="flex justify-center">
-    <img src="https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/2nJZQlFToCYp_iLurvcZ4.png">
-</div>
-
-## Resources
-
- Refer to the [original EfficientLoFTR repository](https://github.com/zju3dv/EfficientLoFTR) for more examples and implementation details.
- [EfficientLoFTR project page](https://zju3dv.github.io/efficientloftr/) with interactive demos and additional information.
-
-## EfficientLoFTRConfig
-
-[[autodoc]] EfficientLoFTRConfig
-
-## EfficientLoFTRImageProcessor
-
-[[autodoc]] EfficientLoFTRImageProcessor
-
- preprocess
- post_process_keypoint_matching
- visualize_keypoint_matching
-
-<frameworkcontent>
-<pt>
-## EfficientLoFTRModel
-
-[[autodoc]] EfficientLoFTRModel
-
- forward
-
-## EfficientLoFTRForKeypointMatching
-
-[[autodoc]] EfficientLoFTRForKeypointMatching
-
- forward
-
-</pt>
-</frameworkcontent>
--- a/docs/source/en/model_doc/encodec.md
+++ b/docs/source/en/model_doc/encodec.md
@ -47,8 +47,7 @@ Here is a quick example of how to encode and decode an audio using this model:
 >>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

 >>> encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
->>> # `encoder_outputs.audio_codes` contains discrete codes
->>> audio_values = model.decode(**encoder_outputs, padding_mask=inputs["padding_mask"])[0]
+>>> audio_values = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"])[0]
 >>> # or the equivalent with a forward pass
 >>> audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values
 ```
--- a/docs/source/en/model_doc/encoder-decoder.md
+++ b/docs/source/en/model_doc/encoder-decoder.md
@ -14,88 +14,115 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZC
D9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-        ">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
 # Encoder Decoder Models

-[`EncoderDecoderModel`](https://huggingface.co/papers/1706.03762) initializes a sequence-to-sequence model with any pretrained autoencoder and pretrained autoregressive model. It is effective for sequence generation tasks as demonstrated in [Text Summarization with Pretrained Encoders](https://huggingface.co/papers/1908.08345) which uses [`BertModel`] as the encoder and decoder.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-> [!TIP]
-> This model was contributed by [thomwolf](https://huggingface.co/thomwolf) and the TensorFlow/Flax version by [ydshieh](https://huggingface.co/ydshieh).
->
-> Click on the Encoder Decoder models in the right sidebar for more examples of how to apply Encoder Decoder to different language tasks.
+## Overview

-The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
+The [`EncoderDecoderModel`] can be used to initialize a sequence-to-sequence model with any
+pretrained autoencoding model as the encoder and any pretrained autoregressive model as the decoder.

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation tasks
+was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://huggingface.co/papers/1907.12461) by
+Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+
+After such an [`EncoderDecoderModel`] has been trained/fine-tuned, it can be saved/loaded just like
+any other models (see the examples for more information).
+
+An application of this architecture could be to leverage two pretrained [`BertModel`] as the encoder
+and decoder for a summarization model as was shown in: [Text Summarization with Pretrained Encoders](https://huggingface.co/papers/1908.08345) by Yang Liu and Mirella Lapata.
+
+## Randomly initializing `EncoderDecoderModel` from model configurations.
+
+[`EncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`BertModel`] configuration for the encoder and the default [`BertForCausalLM`] configuration for the decoder.

 ```python
-from transformers import pipeline
+>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

-summarizer = pipeline(
-    "summarization",
-    model="patrickvonplaten/bert2bert-cnn_dailymail-fp16",
-    device=0
-)
+>>> config_encoder = BertConfig()
+>>> config_decoder = BertConfig()

-text = "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen."
-print(summarizer(text))
+>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+>>> model = EncoderDecoderModel(config=config)
 ```

-</hfoption>
-<hfoption id="AutoModel">
+## Initialising `EncoderDecoderModel` from a pretrained encoder and a pretrained decoder.
+
+[`EncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained auto-encoding model, *e.g.* BERT, can serve as the encoder, while pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, and the pretrained decoder part of sequence-to-sequence models, *e.g.* the decoder of BART, can all be used as the decoder.
+Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized.
+Initializing [`EncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder).
+To do so, the `EncoderDecoderModel` class provides an [`EncoderDecoderModel.from_encoder_decoder_pretrained`] method.

 ```python
-import torch  
-from transformers import AutoModelForCausalLM, AutoTokenizer  
+>>> from transformers import EncoderDecoderModel, BertTokenizer

-tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
-model = AutoModelForCausalLM.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16", torch_dtype=torch.bfloat16, device_map="auto",attn_implementation="sdpa")  
-
-text = "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen."
-
-inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
-
-summary = model.generate(**inputs, max_length=60, num_beams=4, early_stopping=True)
-print(tokenizer.decode(summary[0], skip_special_tokens=True))
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
 ```

-</hfoption>
-<hfoption id="transformers CLI">
+## Loading an existing `EncoderDecoderModel` checkpoint and perform inference.

-```bash
-echo -e "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen." | transformers-cli run --task summarization --model "patrickvonplaten/bert2bert-cnn_dailymail-fp16" --device 0
-```
+To load fine-tuned checkpoints of the `EncoderDecoderModel` class, [`EncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers.

-</hfoption>
-</hfoptions>
-
-## Notes
-
- [`EncoderDecoderModel`] can be initialized using any pretrained encoder and decoder. But depending on the decoder architecture, the cross-attention layers may be randomly initialized.
-
-These models require downstream fine-tuning, as discussed in this [blog post](https://huggingface.co/blog/warm-starting-encoder-decoder). Use [`~EncoderDecoderModel.from_encoder_decoder_pretrained`] to combine encoder and decoder checkpoints.
+To perform inference, one uses the [`generate`] method, which allows you to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling.

 ```python
-from transformers import EncoderDecoderModel, BertTokenizer
+>>> from transformers import AutoTokenizer, EncoderDecoderModel

-tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
-model = EncoderDecoderModel.from_encoder_decoder_pretrained(
-    "google-bert/bert-base-uncased", 
-    "google-bert/bert-base-uncased"
-)
+>>> # load a fine-tuned seq2seq model and corresponding tokenizer
+>>> model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
+>>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
+
+>>> # let's perform inference on a long piece of text
+>>> ARTICLE_TO_SUMMARIZE = (
+...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
+...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
+...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
+... )
+>>> input_ids = tokenizer(ARTICLE_TO_SUMMARIZE, return_tensors="pt").input_ids
+
+>>> # autoregressively generate summary (uses greedy decoding by default)
+>>> generated_ids = model.generate(input_ids)
+>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+>>> print(generated_text)
+nearly 800 thousand customers were affected by the shutoffs. the aim is to reduce the risk of wildfires. nearly 800, 000 customers were expected to be affected by high winds amid dry conditions. pg & e said it scheduled the blackouts to last through at least midday tomorrow.
 ```

- Encoder Decoder models can be fine-tuned like BART, T5 or any other encoder-decoder model. Only 2 inputs are required to compute a loss, `input_ids` and `labels`. Refer to this [notebook](https://colab.research.google.com/drive/1WIk2bxglElfZewOHboPFNj8H44_VAyKE?usp=sharing#scrollTo=ZwQIEhKOrJpl) for a more detailed training example.
+## Loading a PyTorch checkpoint into `TFEncoderDecoderModel`.
+
+[`TFEncoderDecoderModel.from_pretrained`] currently doesn't support initializing the model from a
+PyTorch checkpoint. Passing `from_pt=True` to this method will throw an exception. If there are only PyTorch
+checkpoints for a particular encoder-decoder model, a workaround is:
+
+```python
+>>> # a workaround to load from pytorch checkpoint
+>>> from transformers import EncoderDecoderModel, TFEncoderDecoderModel
+
+>>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
+
+>>> _model.encoder.save_pretrained("./encoder")
+>>> _model.decoder.save_pretrained("./decoder")
+
+>>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
+...     "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
+... )
+>>> # This is only for copying some specific attributes of this particular model.
+>>> model.config = _model.config
+```
+
+## Training
+
+Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model.
+As you can see, only 2 inputs are required for the model in order to compute a loss: `input_ids` (which are the
+`input_ids` of the encoded input sequence) and `labels` (which are the `input_ids` of the encoded
+target sequence).

 ```python
 >>> from transformers import BertTokenizer, EncoderDecoderModel
@ -120,42 +147,11 @@ model = EncoderDecoderModel.from_encoder_decoder_pretrained(
 >>> loss = model(input_ids=input_ids, labels=labels).loss
 ```

- [`EncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config as shown below.
+Detailed [colab](https://colab.research.google.com/drive/1WIk2bxglElfZewOHboPFNj8H44_VAyKE?usp=sharing#scrollTo=ZwQIEhKOrJpl) for training.

-```python
->>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+This model was contributed by [thomwolf](https://github.com/thomwolf). This model's TensorFlow and Flax versions
+were contributed by [ydshieh](https://github.com/ydshieh).

->>> config_encoder = BertConfig()
->>> config_decoder = BertConfig()
-
->>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
->>> model = EncoderDecoderModel(config=config)
-```
-
- The Encoder Decoder Model can also be used for translation as shown below.
-
-```python
-from transformers import AutoTokenizer, EncoderDecoderModel  
-
-# Load a pre-trained translation model  
-model_name = "google/bert2bert_L-24_wmt_en_de" 
-tokenizer = AutoTokenizer.from_pretrained(model_name, pad_token="<pad>", eos_token="</s>", bos_token="<s>")  
-model = EncoderDecoderModel.from_pretrained(model_name)  
-
-# Input sentence to translate  
-input_text = "Plants create energy through a process known as"  
-
-# Encode the input text  
-inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=False).input_ids  
-
-# Generate the translated output  
-outputs = model.generate(inputs)[0]  
-
-# Decode the output tokens to get the translated sentence  
-translated_text = tokenizer.decode(outputs, skip_special_tokens=True)  
-
-print("Translated text:", translated_text)  
-```

 ## EncoderDecoderConfig

--- a/docs/source/en/model_doc/ernie.md
+++ b/docs/source/en/model_doc/ernie.md
@ -14,83 +14,29 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
-    </div>
-</div>
-
 # ERNIE

-[ERNIE1.0](https://arxiv.org/abs/1904.09223), [ERNIE2.0](https://ojs.aaai.org/index.php/AAAI/article/view/6428),
-[ERNIE3.0](https://arxiv.org/abs/2107.02137), [ERNIE-Gram](https://arxiv.org/abs/2010.12148), [ERNIE-health](https://arxiv.org/abs/2110.07244) are a series of powerful models proposed by baidu, especially in Chinese tasks.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-ERNIE (Enhanced Representation through kNowledge IntEgration) is designed to learn language representation enhanced by knowledge masking strategies, which includes entity-level masking and phrase-level masking.
+## Overview
+ERNIE is a series of powerful models proposed by Baidu that perform especially well on Chinese tasks,
+including [ERNIE1.0](https://huggingface.co/papers/1904.09223), [ERNIE2.0](https://ojs.aaai.org/index.php/AAAI/article/view/6428),
+[ERNIE3.0](https://huggingface.co/papers/2107.02137), [ERNIE-Gram](https://huggingface.co/papers/2010.12148), [ERNIE-health](https://huggingface.co/papers/2110.07244), etc.

-Other ERNIE models released by baidu can be found at [Ernie 4.5](./ernie4_5), and [Ernie 4.5 MoE](./ernie4_5_moe).
+These models are contributed by [nghuyong](https://huggingface.co/nghuyong) and the official code can be found in [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) (in PaddlePaddle).

-> [!TIP]
-> This model was contributed by [nghuyong](https://huggingface.co/nghuyong), and the official code can be found in [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) (in PaddlePaddle).
->
-> Click on the ERNIE models in the right sidebar for more examples of how to apply ERNIE to different language tasks.
+### Usage example
+Take `ernie-1.0-base-zh` as an example:

-The example below demonstrates how to predict the `[MASK]` token with [`Pipeline`], [`AutoModel`], and from the command line.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
-
-```py
-from transformers import pipeline
-
-pipeline = pipeline(
-    task="fill-mask",
-    model="nghuyong/ernie-3.0-xbase-zh"
-)
-
-pipeline("巴黎是[MASK]国的首都。")
+```python
+from transformers import AutoTokenizer, AutoModel
+tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
+model = AutoModel.from_pretrained("nghuyong/ernie-1.0-base-zh")
 ```

-</hfoption>
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoModelForMaskedLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(
-    "nghuyong/ernie-3.0-xbase-zh",
-)
-model = AutoModelForMaskedLM.from_pretrained(
-    "nghuyong/ernie-3.0-xbase-zh",
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
-inputs = tokenizer("巴黎是[MASK]国的首都。", return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    outputs = model(**inputs)
-    predictions = outputs.logits
-
-masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
-predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
-predicted_token = tokenizer.decode(predicted_token_id)
-
-print(f"The predicted token is: {predicted_token}")
-```
-
-</hfoption>
-<hfoption id="transformers CLI">
-
-```bash
-echo -e "巴黎是[MASK]国的首都。" | transformers run --task fill-mask --model nghuyong/ernie-3.0-xbase-zh --device 0
-```
-
-</hfoption>
-</hfoptions>
-
-## Notes
-
-Model variants are available in different sizes and languages.
+### Model checkpoints

 |     Model Name      | Language |           Description           |
 |:-------------------:|:--------:|:-------------------------------:|
@ -105,11 +51,18 @@ Model variants are available in different sizes and languages.
 |   ernie-health-zh   | Chinese  | Layer:12, Heads:12, Hidden:768  |
 |    ernie-gram-zh    | Chinese  | Layer:12, Heads:12, Hidden:768  |

-## Resources
-
 You can find all the supported models from huggingface's model hub: [huggingface.co/nghuyong](https://huggingface.co/nghuyong), and model details from paddle's official
 repo: [PaddleNLP](https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers/ERNIE/contents.html)
-and [ERNIE's legacy branch](https://github.com/PaddlePaddle/ERNIE/tree/legacy/develop).
+and [ERNIE](https://github.com/PaddlePaddle/ERNIE/blob/repro).
+
+## Resources
+
+- [Text classification task guide](../tasks/sequence_classification)
+- [Token classification task guide](../tasks/token_classification)
+- [Question answering task guide](../tasks/question_answering)
+- [Causal language modeling task guide](../tasks/language_modeling)
+- [Masked language modeling task guide](../tasks/masked_language_modeling)
+- [Multiple choice task guide](../tasks/multiple_choice)

 ## ErnieConfig

@ -163,4 +116,4 @@ and [ERNIE's legacy branch](https://github.com/PaddlePaddle/ERNIE/tree/legacy/de
 ## ErnieForQuestionAnswering

 [[autodoc]] ErnieForQuestionAnswering
-    - forward
+    - forward
--- a/docs/source/en/model_doc/ernie4_5.md
+++ b/docs/source/en/model_doc/ernie4_5.md
@ -1,99 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
-    </div>
-</div>
-
-# Ernie 4.5
-
-## Overview
-
-The Ernie 4.5 model was released in the [Ernie 4.5 Model Family](https://ernie.baidu.com/blog/posts/ernie4.5/) release by baidu.
-This family of models contains multiple different architectures and model sizes. This model in specific targets the base text
-model without mixture of experts (moe) with 0.3B parameters in total. It uses the standard [Llama](./llama) at its core.
-
-Other models from the family can be found at [Ernie 4.5 Moe](./ernie4_5_moe).
-
-<div class="flex justify-center">
-    <img src="https://ernie.baidu.com/blog/posts/ernie4.5/overview.png"/>
-</div>
-
-
-## Usage Tips
-
-### Generate text
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_name = "baidu/ERNIE-4.5-0.3B-PT"
-
-# load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-
-# prepare the model input
-inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
-prompt = "Hey, are you conscious? Can you talk to me?"
-messages = [
-    {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
-
-# conduct text completion
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=32,
-)
-output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
-
-# decode the generated ids
-generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
-```
-
-This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV).
-The original code can be found [here](https://github.com/PaddlePaddle/ERNIE).
-
-
-## Ernie4_5Config
-
-[[autodoc]] Ernie4_5Config
-
-## Ernie4_5Model
-
-[[autodoc]] Ernie4_5Model
-    - forward
-
-## Ernie4_5ForCausalLM
-
-[[autodoc]] Ernie4_5ForCausalLM
-    - forward
--- a/docs/source/en/model_doc/ernie4_5_moe.md
+++ b/docs/source/en/model_doc/ernie4_5_moe.md
@ -1,183 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
-    </div>
-</div>
-
-# Ernie 4.5 Moe
-
-## Overview
-
-The Ernie 4.5 Moe model was released in the [Ernie 4.5 Model Family](https://ernie.baidu.com/blog/posts/ernie4.5/) release by baidu.
-This family of models contains multiple different architectures and model sizes. This model in specific targets the base text
-model with mixture of experts (moe) - one with 21B total, 3B active parameters and another one with 300B total, 47B active parameters.
-It uses the standard [Llama](./llama) at its core combined with a specialized MoE based on [Mixtral](./mixtral) with additional shared
-experts.
-
-Other models from the family can be found at [Ernie 4.5](./ernie4_5).
-
-<div class="flex justify-center">
-    <img src="https://ernie.baidu.com/blog/posts/ernie4.5/overview.png"/>
-</div>
-
-
-## Usage Tips
-
-### Generate text
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_name = "baidu/ERNIE-4.5-21B-A3B-PT"
-
-# load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-
-# prepare the model input
-inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
-prompt = "Hey, are you conscious? Can you talk to me?"
-messages = [
-    {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
-
-# conduct text completion
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=32,
-)
-output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
-
-# decode the generated ids
-generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
-```
-
-### Distributed Generation with Tensor Parallelism
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_name = "baidu/ERNIE-4.5-21B-A3B-PT"
-
-# load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-    tp_plan="auto",
-)
-
-# prepare the model input
-inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
-prompt = "Hey, are you conscious? Can you talk to me?"
-messages = [
-    {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
-
-# conduct text completion
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=32,
-)
-output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
-
-# decode the generated ids
-generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
-```
-
-### Quantization with Bitsandbytes
-
-```python
-import torch
-from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
-
-model_name = "baidu/ERNIE-4.5-21B-A3B-PT"
-
-# load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",
-    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
-)
-
-# prepare the model input
-inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
-prompt = "Hey, are you conscious? Can you talk to me?"
-messages = [
-    {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
-
-# conduct text completion
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=32,
-)
-output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
-
-# decode the generated ids
-generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
-```
-
-This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV).
-The original code can be found [here](https://github.com/PaddlePaddle/ERNIE).
-
-
-## Ernie4_5_MoeConfig
-
-[[autodoc]] Ernie4_5_MoeConfig
-
-## Ernie4_5_MoeModel
-
-[[autodoc]] Ernie4_5_MoeModel
-    - forward
-
-## Ernie4_5_MoeForCausalLM
-
-[[autodoc]] Ernie4_5_MoeForCausalLM
-    - forward
-    - generate
--- a/docs/source/en/model_doc/evolla.md
+++ b/docs/source/en/model_doc/evolla.md
@ -1,95 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Evolla
-
-## Overview
-
-The Evolla model was proposed in [Decoding the Molecular Language of Proteins with Evolla](https://doi.org/10.1101/2025.01.05.630192) by [Zhou et al.](https://doi.org/10.1101/2025.01.05.630192).
-
-Evolla is an advanced 80-billion-parameter protein-language generative model designed to decode the molecular language of proteins. It integrates information from protein sequences, structures, and user queries to generate precise and contextually nuanced insights into protein function. Trained on an unprecedented AI-generated dataset of 546 million protein question-answer pairs and 150 billion word tokens, Evolla significantly advances research in proteomics and functional genomics, providing expert-level insights and shedding light on the molecular logic encoded in proteins.
-
-The abstract from the paper is the following:
-
-*Proteins, nature’s intricate molecular machines, are the products of billions of years of evolution and play fundamental roles in sustaining life. Yet, deciphering their molecular language - that is, understanding how protein sequences and structures encode and determine biological functions - remains a corner-stone challenge in modern biology. Here, we introduce Evolla, an 80 billion frontier protein-language generative model designed to decode the molecular language of proteins. By integrating information from protein sequences, structures, and user queries, Evolla generates precise and contextually nuanced insights into protein function. A key innovation of Evolla lies in its training on an unprecedented AI-generated dataset: 546 million protein question-answer pairs and 150 billion word tokens, designed to reflect the immense complexity and functional diversity of proteins. Post-pretraining, Evolla integrates Direct Preference Optimization (DPO) to refine the model based on preference signals and Retrieval-Augmented Generation (RAG) for external knowledge incorporation, improving response quality and relevance. To evaluate its performance, we propose a novel framework, Instructional Response Space (IRS), demonstrating that Evolla delivers expert-level insights, advancing research in proteomics and functional genomics while shedding light on the molecular logic encoded in proteins. The online demo is available at http://www.chat-protein.com/.*
-
-Examples:
-
-```python
-processor = EvollaProcessor.from_pretrained("westlake-repl/Evolla-10B-DPO-hf")
-model = EvollaForProteinText2Text.from_pretrained("westlake-repl/Evolla-10B-DPO-hf")
-# aa_seq should have same length as foldseek
-protein_inputs = [
-    {
-        
-        "aa_seq": "MATGGRRG...",
-        "foldseek": "###lqpfd...", # hashtag means the low-confidence foldseek tokens
-    },
-    {
-        "aa_seq": "MLPGLALL...",
-        "foldseek": "dfwwkwad...",
-    }
-]
-message_list = [
-    [
-        {
-            "role": "system",
-            "content": "You are an AI expert that can answer any questions about protein.",
-        },
-        {"role": "user", "content": "What is the function of this protein?"},
-    ],
-    [
-        {
-            "role": "system",
-            "content": "You are an AI expert that can answer any questions about protein.",
-        },
-        {"role": "user", "content": "What is the function of this protein?"},
-    ]
-]
-input_dict = processor(
-    protein_informations, messages_list, return_tensors="pt", text_max_length=512, protein_max_length=1024
-)
-with torch.no_grad():
-    generated_ids = hf_model.generate(**input_dict)
-generated_texts = processor.batch_decode(
-    generated_ids, skip_special_tokens=True
-)
-```
-
-Tips:
-
- This model was contributed by [Xibin Bayes Zhou](https://huggingface.co/XibinBayesZhou).
- The original code can be found [here](https://github.com/westlake-repl/Evolla).
-
-
-## EvollaConfig
-
-[[autodoc]] EvollaConfig
-
-## EvollaModel
-
-[[autodoc]] EvollaModel
-    - forward
-
-## EvollaForProteinText2Text
-
-[[autodoc]] EvollaForProteinText2Text
-    - forward
-
-## EvollaProcessor
-
-[[autodoc]] EvollaProcessor
-    - __call__
--- a/docs/source/en/model_doc/exaone4.md
+++ b/docs/source/en/model_doc/exaone4.md
@ -1,208 +0,0 @@
-<!--Copyright 2025 The LG AI Research and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# EXAONE 4
-
-## Overview
-
-**[EXAONE 4.0](https://github.com/LG-AI-EXAONE/EXAONE-4.0)** model is the language model, which integrates a **Non-reasoning mode** and **Reasoning mode** to achieve both the excellent usability of [EXAONE 3.5](https://github.com/LG-AI-EXAONE/EXAONE-3.5) and the advanced reasoning abilities of [EXAONE Deep](https://github.com/LG-AI-EXAONE/EXAONE-Deep). To pave the way for the agentic AI era, EXAONE 4.0 incorporates essential features such as agentic tool use, and its multilingual capabilities are extended
-to support Spanish in addition to English and Korean. 
-
-The EXAONE 4.0 model series consists of two sizes: a mid-size **32B** model optimized for high performance, and a small-size **1.2B** model designed for on-device applications.
-
-In the EXAONE 4.0 architecture, we apply new architectural changes compared to previous EXAONE models as below:
-
-1. **Hybrid Attention**: For the 32B model, we adopt hybrid attention scheme, which combines *Local attention (sliding window attention)* with *Global attention (full attention)* in a 3:1 ratio. We do not use RoPE (Rotary Positional Embedding) for global attention for better global context understanding.
-2. **QK-Reorder-Norm**: We reorder the LayerNorm position from the traditional Pre-LN scheme by applying LayerNorm directly to the attention and MLP outputs, and we add RMS normalization right after the Q and K projection. It helps yield better performance on downstream tasks despite consuming more computation.
-
-For more details, please refer to our [technical report](https://arxiv.org/abs/2507.11407), [HuggingFace paper](https://huggingface.co/papers/2507.11407), [blog](https://www.lgresearch.ai/blog/view?seq=576), and [GitHub](https://github.com/LG-AI-EXAONE/EXAONE-4.0).
-
-All model weights including quantized versions are available at [Huggingface Collections](https://huggingface.co/collections/LGAI-EXAONE/exaone-40-686b2e0069800c835ed48375).
-
-
-## Model Details
-
-### Model Specifications
-
-| Model Configuration | 32B | 1.2B |
-|:-------------------|:-----:|:------:|
-| d_model | 5,120 | 2,048 |
-| Number of layers | 64 | 30 |
-| Normalization | QK-Reorder-LN | QK-Reorder-LN |
-| Non-linearity | SwiGLU | SwiGLU |
-| Feedforward dimension | 27,392 | 4,096 |
-| Attention type | Hybrid (3:1 Local-Global) | Global |
-| Head type | GQA | GQA |
-| Number of heads | 40 | 32 |
-| Number of KV heads | 8 | 8 |
-| Head size | 128 | 64 |
-| Max sequence length | 131,072 | 65,536 |
-| RoPE theta | 1,000,000 | 1,000,000 |
-| Tokenizer | BBPE | BBPE |
-| Vocab size | 102,400 | 102,400 |
-| Tied word embedding | False | True |
-| Knowledge cut-off | Nov. 2024 | Nov. 2024 |
-
-
-## Usage tips
-
-### Non-reasoning mode
-
-For general use, you can use the EXAONE 4.0 models with the following example:
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_name = "LGAI-EXAONE/EXAONE-4.0-32B"
-
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype="bfloat16",
-    device_map="auto"
-)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# choose your prompt
-prompt = "Explain how wonderful you are"
-prompt = "Explica lo increíble que eres"
-prompt = "너가 얼마나 대단한지 설명해 봐"
-
-messages = [
-    {"role": "user", "content": prompt}
-]
-input_ids = tokenizer.apply_chat_template(
-    messages,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_tensors="pt"
-)
-
-output = model.generate(
-    input_ids.to(model.device),
-    max_new_tokens=128,
-    do_sample=False,
-)
-print(tokenizer.decode(output[0]))
-```
-
-### Reasoning mode
-
-The EXAONE 4.0 models have reasoning capabilities for handling complex problems. You can activate reasoning mode by using the `enable_thinking=True` argument with the tokenizer, which opens a reasoning block that starts with `<think>` tag without closing it.
-
-```python
-messages = [
-    {"role": "user", "content": "Which one is bigger, 3.12 vs 3.9?"}
-]
-input_ids = tokenizer.apply_chat_template(
-    messages,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    enable_thinking=True,
-)
-
-output = model.generate(
-    input_ids.to(model.device),
-    max_new_tokens=128,
-    do_sample=True,
-    temperature=0.6,
-    top_p=0.95
-)
-print(tokenizer.decode(output[0]))
-```
-
-> [!IMPORTANT]
-> The model generation with reasoning mode can be affected sensitively by sampling parameters, so please refer to the [Usage Guideline](https://github.com/LG-AI-EXAONE/EXAONE-4.0#usage-guideline) on official GitHub page for better quality.
-
-### Agentic tool use
-
-The EXAONE 4.0 models can be used as agents with their tool calling capabilities. You can provide tool schemas to the model for effective tool calling.
-
-```python
-import random
-
-def roll_dice(max_num: int):
-    return random.randint(1, max_num)
-
-tools = [
-    {
-        "type": "function",
-        "function": {
-            "name": "roll_dice",
-            "description": "Roll a dice with the number 1 to N. User can select the number N.",
-            "parameters": {
-                "type": "object",
-                "required": ["max_num"],
-                "properties": {
-                    "max_num": {
-                        "type": "int",
-                        "description": "Max number of the dice"
-                    }
-                }
-            }
-        }
-    }
-]
-
-messages = [
-    {"role": "user", "content": "Roll D6 dice twice!"}
-]
-input_ids = tokenizer.apply_chat_template(
-    messages,
-    tokenize=True,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    tools=tools,
-)
-
-output = model.generate(
-    input_ids.to(model.device),
-    max_new_tokens=1024,
-    do_sample=True,
-    temperature=0.6,
-    top_p=0.95,
-)
-print(tokenizer.decode(output[0]))
-```
-
-## Exaone4Config
-
-[[autodoc]] Exaone4Config
-
-## Exaone4Model
-
-[[autodoc]] Exaone4Model
-    - forward
-
-## Exaone4ForCausalLM
-
-[[autodoc]] Exaone4ForCausalLM
-    - forward
-
-## Exaone4ForSequenceClassification
-
-[[autodoc]] Exaone4ForSequenceClassification
-    - forward
-
-## Exaone4ForTokenClassification
-
-[[autodoc]] Exaone4ForTokenClassification
-    - forward
-
-## Exaone4ForQuestionAnswering
-
-[[autodoc]] Exaone4ForQuestionAnswering
-    - forward
--- a/docs/source/en/model_doc/falcon_mamba.md
+++ b/docs/source/en/model_doc/falcon_mamba.md
@ -110,13 +110,6 @@ outputs = model.generate(**inputs, max_new_tokens=100)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

-## FalconMambaCache
-
-[[autodoc]] FalconMambaCache
-    - update_conv_state
-    - update_ssm_state
-    - reset
-
 ## FalconMambaConfig

 [[autodoc]] FalconMambaConfig
--- a/docs/source/en/model_doc/gemma3.md
+++ b/docs/source/en/model_doc/gemma3.md
@ -267,8 +267,3 @@ visualizer("<img>What is shown in this image?")

 [[autodoc]] Gemma3ForConditionalGeneration
    - forward
-
-## Gemma3ForSequenceClassification
-
-[[autodoc]] Gemma3ForSequenceClassification
-    - forward
--- a/docs/source/en/model_doc/gemma3n.md
+++ b/docs/source/en/model_doc/gemma3n.md
@ -30,7 +30,7 @@ Gemma3n is a multimodal model with pretrained and instruction-tuned variants, av
 large portions of the language model architecture are shared with prior Gemma releases, there are many new additions in
 this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented Residual Layer][laurel] (LAuReL),
 [MatFormer][matformer], Per-Layer Embeddings (PLE), [Activation Sparsity with Statistical Top-k][spark-transformer], and KV cache sharing. The language model uses
-a similar attention pattern to [Gemma 3](./gemma3) with alternating 4 local sliding window self-attention layers for
+a similar attention pattern to [Gemma 3](./gemma3.md) with alternating 4 local sliding window self-attention layers for
 every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
 [MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
 trained audio encoder based on the [Universal Speech Model][usm] (USM) architecture.
--- a/docs/source/en/model_doc/glm4_moe.md
+++ b/docs/source/en/model_doc/glm4_moe.md
@ -1,35 +0,0 @@
-<!--Copyright 2025 The ZhipuAI Inc. and The HuggingFace Inc. team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Glm4Moe
-
-## Overview
-
-This will update After model release.
-
-## Glm4MoeConfig
-
-[[autodoc]] Glm4MoeConfig
-
-## Glm4MoeModel
-
-[[autodoc]] Glm4MoeModel
-    - forward
-
-## Glm4MoeForCausalLM
-
-[[autodoc]] Glm4MoeForCausalLM
-    - forward
--- a/docs/source/en/model_doc/gpt2.md
+++ b/docs/source/en/model_doc/gpt2.md
@ -57,7 +57,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
 tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

-input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")
+input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")

 output = model.generate(**input_ids, cache_implementation="static")
 print(tokenizer.decode(output[0], skip_special_tokens=True))
--- a/docs/source/en/model_doc/granitemoehybrid.md
+++ b/docs/source/en/model_doc/granitemoehybrid.md
@ -48,32 +48,6 @@ for i in output:

 This HF implementation is contributed by [Sukriti Sharma](https://huggingface.co/SukritiSharma) and [Alexander Brooks](https://huggingface.co/abrooks9944).

-## Notes
-
- `GraniteMoeHybridForCausalLM` supports padding-free training which concatenates distinct training examples while still processing inputs as separate batches. It can significantly accelerate inference by [~2x](https://github.com/huggingface/transformers/pull/35861#issue-2807873129) (depending on model and data distribution) and reduce memory-usage if there are examples of varying lengths by avoiding unnecessary compute and memory overhead from padding tokens.
-
-  Padding-free training requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d` packages and the following arguments must be passed to the model in addition to `input_ids` and `labels`.
-
-  - `position_ids: torch.LongTensor`: the position index of each token in each sequence.
-  - `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
-  - Each of the [`FlashAttentionKwargs`]
-    - `cu_seq_lens_q: torch.LongTensor`: the cumulative sequence lengths of all queries.
-    - `cu_seq_lens_k: torch.LongTensor`: the cumulative sequence lengths of all keys.
-    - `max_length_q: int`: the longest query length in the batch.
-    - `max_length_k: int`: the longest key length in the batch.
-
-  The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] programmatically generates the set of additional arguments above using `return_seq_idx=True` and `return_flash_attn_kwargs=True`. See the [Improving Hugging Face Training Efficiency Through Packing with Flash Attention](https://huggingface.co/blog/packing-with-FA2) blog post for additional information.
-
-  ```python
-  from transformers import DataCollatorWithFlattening
-
-  # Example of using padding-free training
-  data_collator = DataCollatorWithFlattening(
-      tokenizer=tokenizer,
-      return_seq_idx=True,
-      return_flash_attn_kwargs=True
-  )
-  ```

 ## GraniteMoeHybridConfig

@ -87,4 +61,4 @@ This HF implementation is contributed by [Sukriti Sharma](https://huggingface.co
 ## GraniteMoeHybridForCausalLM

 [[autodoc]] GraniteMoeHybridForCausalLM
-    - forward
+    - forward
--- a/docs/source/en/model_doc/idefics2.md
+++ b/docs/source/en/model_doc/idefics2.md
@ -169,9 +169,9 @@ model = Idefics2ForConditionalGeneration.from_pretrained(

 ## Shrinking down Idefics2 using quantization

-As the Idefics2 model has 8 billion parameters, that would require about 16GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization). If the model is quantized to 4 bits (or half a byte per parameter), that requires only about 3.5GB of RAM.
+As the Idefics2 model has 8 billion parameters, that would require about 16GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), that requires only about 3.5GB of RAM.

-Quantizing a model is as simple as passing a `quantization_config` to the model. One can change the code snippet above with the changes below. We'll leverage the BitsAndyBytes quantization (but refer to [this page](../quantization) for other quantization methods):
+Quantizing a model is as simple as passing a `quantization_config` to the model. One can change the code snippet above with the changes below. We'll leverage the BitsAndBytes quantization (but refer to [this page](../quantization.md) for other quantization methods):

 ```diff
 + from transformers import BitsAndBytesConfig
@ -193,7 +193,7 @@ model = Idefics2ForConditionalGeneration.from_pretrained(

 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Idefics2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.

- A notebook on how to fine-tune Idefics2 on a custom dataset using the [Trainer](../main_classes/trainer) can be found [here](https://colab.research.google.com/drive/1NtcTgRbSBKN7pYD3Vdx1j9m8pt3fhFDB?usp=sharing). It supports both full fine-tuning as well as (quantized) LoRa.
+- A notebook on how to fine-tune Idefics2 on a custom dataset using the [Trainer](../main_classes/trainer.md) can be found [here](https://colab.research.google.com/drive/1NtcTgRbSBKN7pYD3Vdx1j9m8pt3fhFDB?usp=sharing). It supports both full fine-tuning as well as (quantized) LoRA.
 - A script regarding how to fine-tune Idefics2 using the TRL library can be found [here](https://gist.github.com/edbeeching/228652fc6c2b29a1641be5a5778223cb).
 - Demo notebook regarding fine-tuning Idefics2 for JSON extraction use cases can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Idefics2). 🌎

--- a/docs/source/en/model_doc/ijepa.md
+++ b/docs/source/en/model_doc/ijepa.md
@ -1,4 +1,4 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -14,107 +14,53 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
 # I-JEPA

-[I-JEPA](https://huggingface.co/papers/2301.08243) is a self-supervised learning method that learns semantic image representations by predicting parts of an image from other parts of the image. It compares the abstract representations of the image (rather than pixel level comparisons), which avoids the typical pitfalls of data augmentation bias and pixel-level details that don't capture semantic meaning.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-You can find the original I-JEPA checkpoints under the [AI at Meta](https://huggingface.co/facebook/models?search=ijepa) organization.
-> [!TIP]
-> This model was contributed by [jmtzt](https://huggingface.co/jmtzt).
+## Overview

+The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://huggingface.co/papers/2301.08243) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas.
+I-JEPA is a self-supervised learning method that predicts the representations of one part of an image based on other parts of the same image. This approach focuses on learning semantic features without relying on pre-defined invariances from hand-crafted data transformations, which can bias specific tasks, or on filling in pixel-level details, which often leads to less meaningful representations.

-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/ijepa_architecture.jpg">
+The abstract from the paper is the following:

+This paper demonstrates an approach for learning highly semantic image representations without relying on hand-crafted data-augmentations. We introduce the Image-based Joint-Embedding Predictive Architecture (I-JEPA), a non-generative approach for self-supervised learning from images. The idea behind I-JEPA is simple: from a single context block, predict the representations of various target blocks in the same image. A core design choice to guide I-JEPA towards producing semantic representations is the masking strategy; specifically, it is crucial to (a) sample target blocks with sufficiently large scale (semantic), and to (b) use a sufficiently informative (spatially distributed) context block. Empirically, when combined with Vision Transformers, we find I-JEPA to be highly scalable. For instance, we train a ViT-Huge/14 on ImageNet using 16 A100 GPUs in under 72 hours to achieve strong downstream performance across a wide range of tasks, from linear classification to object counting and depth prediction.

-> Click on the I-JEPA models in the right sidebar for more examples of how to apply I-JEPA to different image representation and classification tasks.
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/ijepa_architecture.jpg"
+alt="drawing" width="600"/>

-The example below demonstrates how to extract image features with [`Pipeline`] or the [`AutoModel`] class.
+<small> I-JEPA architecture. Taken from the <a href="https://huggingface.co/papers/2301.08243">original paper.</a> </small>

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+This model was contributed by [jmtzt](https://huggingface.co/jmtzt).
+The original code can be found [here](https://github.com/facebookresearch/ijepa).

-```py
-import torch
-from transformers import pipeline
-feature_extractor = pipeline(
-    task="image-feature-extraction",
-    model="facebook/ijepa_vith14_1k",
-    device=0,
-    torch_dtype=torch.bfloat16
-)
-features = feature_extractor("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", return_tensors=True)  
+## How to use

-print(f"Feature shape: {features.shape}")
+Here is how to use this model for image feature extraction:

-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```py
+```python
 import requests
 import torch
 from PIL import Image
 from torch.nn.functional import cosine_similarity
-from transformers import AutoModel, AutoProcessor  

-url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"  
-url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg"
-image_1 = Image.open(requests.get(url_1, stream=True).raw)
-image_2 = Image.open(requests.get(url_2, stream=True).raw)
-
-processor = AutoProcessor.from_pretrained("facebook/ijepa_vith14_1k")  
-model = AutoModel.from_pretrained("facebook/ijepa_vith14_1k", torch_dtype="auto", attn_implementation="sdpa")  
-
-
-def infer(image):  
-    inputs = processor(image, return_tensors="pt")  
-    outputs = model(**inputs)  
-    return outputs.last_hidden_state.mean(dim=1)  
-
-
-embed_1 = infer(image_1)  
-embed_2 = infer(image_2)  
-
-similarity = cosine_similarity(embed_1, embed_2)  
-print(similarity)
-```
-</hfoption>
-</hfoptions>
-
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
-
-```py
-import torch
-from transformers import BitsAndBytesConfig, AutoModel, AutoProcessor
-from datasets import load_dataset
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_use_double_quant=True,
-)
+from transformers import AutoModel, AutoProcessor

 url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
 url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg"
 image_1 = Image.open(requests.get(url_1, stream=True).raw)
 image_2 = Image.open(requests.get(url_2, stream=True).raw)

-processor = AutoProcessor.from_pretrained("facebook/ijepa_vitg16_22k")
-model = AutoModel.from_pretrained("facebook/ijepa_vitg16_22k", quantization_config=quantization_config, torch_dtype="auto", attn_implementation="sdpa")
-
+model_id = "facebook/ijepa_vith14_1k"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModel.from_pretrained(model_id)

+@torch.no_grad()
 def infer(image):
    inputs = processor(image, return_tensors="pt")
    outputs = model(**inputs)
@ -128,6 +74,15 @@ similarity = cosine_similarity(embed_1, embed_2)
 print(similarity)
 ```

+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with I-JEPA.
+
+<PipelineTag pipeline="image-classification"/>
+
+- [`IJepaForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
+- See also: [Image classification task guide](../tasks/image_classification)
+
 ## IJepaConfig

 [[autodoc]] IJepaConfig
@ -140,5 +95,4 @@ print(similarity)
 ## IJepaForImageClassification

 [[autodoc]] IJepaForImageClassification
-    - forward
-
+    - forward
--- a/docs/source/en/model_doc/janus.md
+++ b/docs/source/en/model_doc/janus.md
@ -44,11 +44,11 @@ Here is the example of visual understanding with a single image.
 > Note that the model has been trained with a specific prompt format for chatting. Use `processor.apply_chat_template(my_conversation_dict)` to correctly format your prompts.

 ```python
-import torch
-from PIL import Image
-import requests
+import torch  
+from PIL import Image  
+import requests  

-from transformers import JanusForConditionalGeneration, JanusProcessor
+from transformers import JanusForConditionalGeneration, JanusProcessor  

 model_id = "deepseek-community/Janus-Pro-1B"
 # Prepare Input for generation.
@ -64,7 +64,7 @@ messages = [

 # Set generation mode to `text` to perform text generation.
 processor = JanusProcessor.from_pretrained(model_id)
-model = JanusForConditionalGeneration.from_pretrained(model_id,
+model = JanusForConditionalGeneration.from_pretrained(model_id,     
        torch_dtype=torch.bfloat16,
        device_map="auto")

@ -209,10 +209,6 @@ for i, image in enumerate(images['pixel_values']):

 [[autodoc]] JanusImageProcessor

-## JanusImageProcessorFast
-
-[[autodoc]] JanusImageProcessorFast
-
 ## JanusVisionModel

 [[autodoc]] JanusVisionModel
--- a/docs/source/en/model_doc/lfm2.md
+++ b/docs/source/en/model_doc/lfm2.md
@ -1,84 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-# LFM2
-
-## Overview
-
-[LFM2](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) represents a new generation of Liquid Foundation Models developed by [Liquid AI](https://liquid.ai/), specifically designed for edge AI and on-device deployment. 
-
-The models are available in three sizes (350M, 700M, and 1.2B parameters) and are engineered to run efficiently on CPU, GPU, and NPU hardware, making them particularly well-suited for applications requiring low latency, offline operation, and privacy.
-
-## Architecture
-
-The architecture consists of 16 blocks total: 10 double-gated short-range convolution blocks and 6 blocks of grouped query attention. This design stems from the concept of dynamical systems, where linear operations are modulated by input-dependent gates, allowing for "liquid" dynamics that can adapt in real-time. The short convolutions are particularly optimized for embedded SoC CPUs, making them ideal for devices that require fast, local inference without relying on cloud connectivity.
-
-The key architectural innovation of LFM2 lies in its systematic approach to balancing quality, latency, and memory efficiency through our STAR neural architecture search engine. Using STAR, Liquid AI optimized the models for real-world performance on embedded hardware, measuring actual peak memory usage and inference speed on Qualcomm Snapdragon processors. This results in models that achieve 2x faster decode and prefill performance compared to similar-sized models, while maintaining superior benchmark performance across knowledge, mathematics, instruction following, and multilingual tasks.
-
-## Example
-
-The following example shows how to generate an answer using the `AutoModelForCausalLM` class.
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-# Load model and tokenizer
-model_id = "LiquidAI/LFM2-1.2B"
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype="bfloat16",
-)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-# Generate answer
-prompt = "What is C. elegans?"
-input_ids = tokenizer.apply_chat_template(
-    [{"role": "user", "content": prompt}],
-    add_generation_prompt=True,
-    return_tensors="pt",
-    tokenize=True,
-)
-
-output = model.generate(
-    input_ids,
-    do_sample=True,
-    temperature=0.3,
-    min_p=0.15,
-    repetition_penalty=1.05,
-    max_new_tokens=512,
-)
-
-print(tokenizer.decode(output[0], skip_special_tokens=False))
-```
-
-## Lfm2Config
-
-[[autodoc]] Lfm2Config
-
-## Lfm2Model
-
-[[autodoc]] Lfm2Model
-    - forward
-
-## Lfm2ForCausalLM
-
-[[autodoc]] Lfm2ForCausalLM
-    - forward
--- a/docs/source/en/model_doc/lightglue.md
+++ b/docs/source/en/model_doc/lightglue.md
@ -10,31 +10,37 @@ specific language governing permissions and limitations under the License.
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

-->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
-    </div>
-</div>
+-->

 # LightGlue

-[LightGlue](https://arxiv.org/abs/2306.13643) is a deep neural network that learns to match local features across images. It revisits multiple design decisions of SuperGlue and derives simple but effective improvements. Cumulatively, these improvements make LightGlue more efficient - in terms of both memory and computation, more accurate, and much easier to train. Similar to [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor), this model consists of matching two sets of local features extracted from two images, with the goal of being faster than SuperGlue. Paired with the [SuperPoint model](https://huggingface.co/magic-leap-community/superpoint), it can be used to match two images and estimate the pose between them.
+## Overview

-You can find all the original LightGlue checkpoints under the [ETH-CVG](https://huggingface.co/ETH-CVG) organization.
+The LightGlue model was proposed in [LightGlue: Local Feature Matching at Light Speed](https://arxiv.org/abs/2306.13643)
+by Philipp Lindenberger, Paul-Edouard Sarlin and Marc Pollefeys.

-> [!TIP]
-> This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
->
-> Click on the LightGlue models in the right sidebar for more examples of how to apply LightGlue to different computer vision tasks.
+Similar to [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor), this model consists of matching
+two sets of local features extracted from two images; its goal is to be faster than SuperGlue. Paired with the
+[SuperPoint model](https://huggingface.co/magic-leap-community/superpoint), it can be used to match two images and 
+estimate the pose between them. This model is useful for tasks such as image matching, homography estimation, etc.

-The example below demonstrates how to match keypoints between two images with the [`AutoModel`] class.
+The abstract from the paper is the following:

-<hfoptions id="usage">
-<hfoption id="AutoModel">
+*We introduce LightGlue, a deep neural network that learns to match local features across images. We revisit multiple
+design decisions of SuperGlue, the state of the art in sparse matching, and derive simple but effective improvements. 
+Cumulatively, they make LightGlue more efficient - in terms of both memory and computation, more accurate, and much
+easier to train. One key property is that LightGlue is adaptive to the difficulty of the problem: the inference is much
+faster on image pairs that are intuitively easy to match, for example because of a larger visual overlap or limited
+appearance change. This opens up exciting prospects for deploying deep matchers in latency-sensitive applications like
+3D reconstruction. The code and trained models are publicly available at this [https URL](https://github.com/cvg/LightGlue)*

-```py
+## How to use
+
+Here is a quick example of using the model. Since this model is an image matching model, it requires pairs of images to be matched. 
+The raw outputs contain the list of keypoints detected by the keypoint detector as well as the list of matches with their corresponding 
+matching scores.
+```python
 from transformers import AutoImageProcessor, AutoModel
 import torch
 from PIL import Image
@ -53,70 +59,31 @@ model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")
 inputs = processor(images, return_tensors="pt")
 with torch.no_grad():
    outputs = model(**inputs)
-
-# Post-process to get keypoints and matches
-image_sizes = [[(image.height, image.width) for image in images]]
-processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
 ```

-</hfoption>
-</hfoptions>
+You can use the `post_process_keypoint_matching` method from the `LightGlueImageProcessor` to get the keypoints and matches in a readable format:
+```python
+image_sizes = [[(image.height, image.width) for image in images]]
+outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
+for i, output in enumerate(outputs):
+    print("For the image pair", i)
+    for keypoint0, keypoint1, matching_score in zip(
+            output["keypoints0"], output["keypoints1"], output["matching_scores"]
+    ):
+        print(
+            f"Keypoint at coordinate {keypoint0.numpy()} in the first image matches with keypoint at coordinate {keypoint1.numpy()} in the second image with a score of {matching_score}."
+        )
+```

-## Notes
+You can visualize the matches between the images by providing the original images as well as the outputs to this method:
+```python
+processor.plot_keypoint_matching(images, outputs)
+```

- LightGlue is adaptive to the task difficulty. Inference is much faster on image pairs that are intuitively easy to match, for example, because of a larger visual overlap or limited appearance change.
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/duPp09ty8NRZlMZS18ccP.png)

-    ```py
-    from transformers import AutoImageProcessor, AutoModel
-    import torch
-    from PIL import Image
-    import requests
-    
-    processor = AutoImageProcessor.from_pretrained("ETH-CVG/lightglue_superpoint")
-    model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")
-    
-    # LightGlue requires pairs of images
-    images = [image1, image2]
-    inputs = processor(images, return_tensors="pt")
-    outputs = model(**inputs)
-    
-    # Extract matching information
-    keypoints0 = outputs.keypoints0  # Keypoints in first image
-    keypoints1 = outputs.keypoints1  # Keypoints in second image
-    matches = outputs.matches        # Matching indices
-    matching_scores = outputs.matching_scores  # Confidence scores
-    ```
-
- The model outputs matching indices, keypoints, and confidence scores for each match, similar to SuperGlue but with improved efficiency.
- For better visualization and analysis, use the [`LightGlueImageProcessor.post_process_keypoint_matching`] method to get matches in a more readable format.
-
-    ```py
-    # Process outputs for visualization
-    image_sizes = [[(image.height, image.width) for image in images]]
-    processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
-    
-    for i, output in enumerate(processed_outputs):
-        print(f"For the image pair {i}")
-        for keypoint0, keypoint1, matching_score in zip(
-                output["keypoints0"], output["keypoints1"], output["matching_scores"]
-        ):
-            print(f"Keypoint at {keypoint0.numpy()} matches with keypoint at {keypoint1.numpy()} with score {matching_score}")
-    ```
-
- Visualize the matches between the images using the built-in plotting functionality.
-
-    ```py
-    # Easy visualization using the built-in plotting method
-    processor.visualize_keypoint_matching(images, processed_outputs)
-    ```
-
-<div class="flex justify-center">
-    <img src="https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/duPp09ty8NRZlMZS18ccP.png">
-</div>
-
-## Resources
-
- Refer to the [original LightGlue repository](https://github.com/cvg/LightGlue) for more examples and implementation details.
+This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
+The original code can be found [here](https://github.com/cvg/LightGlue).

 ## LightGlueConfig

@ -128,15 +95,10 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size

 - preprocess
 - post_process_keypoint_matching
- visualize_keypoint_matching
+- plot_keypoint_matching

-<frameworkcontent>
-<pt>
 ## LightGlueForKeypointMatching

 [[autodoc]] LightGlueForKeypointMatching

 - forward
-
-</pt>
-</frameworkcontent>
--- a/docs/source/en/model_doc/llava_next.md
+++ b/docs/source/en/model_doc/llava_next.md
@ -14,178 +14,287 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-  <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-  </div>
-</div>
-
 # LLaVA-NeXT

-[LLaVA‑NeXT](https://llava-vl.github.io/blog/2024-05-10-llava-next-stronger-llms/) improves on [Llava](./llava) by increasing the input image resolution by 4x more pixels and supporting 3 aspect ratios (up to 672x672, 336x1344, 1344x336) to better grasp visual details. It is also trained on an improved visual instruction tuning dataset covering more scenarios and applications to improve OCR and common sense reasoning.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-You can find all the original LLaVA‑NeXT checkpoints under the [LLaVA-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf) collection.
+## Overview

-> [!TIP]
-> This model was contributed by [nielsr](https://huggingface.co/nielsr).
->
-> Click on the LLaVA‑NeXT models in the right sidebar for more examples of how to apply Llava-NeXT to different multimodal tasks.
+The LLaVA-NeXT model was proposed in [LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/) by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee. LLaVa-NeXT (also called LLaVa-1.6) improves upon [LLaVa](llava) by increasing the input image resolution and training on an improved visual instruction tuning dataset to improve OCR and common sense reasoning.

-The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
+The introduction from the blog is the following:

-<hfoptions id="usage">
+*In October 2023, we released LLaVA-1.5 with a simple and efficient design along with great performance on a benchmark suite of 12 datasets. It has since served as the foundation of many comprehensive studies of data, model, and capabilities of large multimodal models (LMM), and has enabled various new applications.

-<hfoption id="Pipeline">
+Today, we are thrilled to present LLaVA-NeXT, with improved reasoning, OCR, and world knowledge. LLaVA-NeXT even exceeds Gemini Pro on several benchmarks.
+
+Compared with LLaVA-1.5, LLaVA-NeXT has several improvements:
+
+Increasing the input image resolution to 4x more pixels. This allows it to grasp more visual details. It supports three aspect ratios, up to 672x672, 336x1344, 1344x336 resolution.
+Better visual reasoning and OCR capability with an improved visual instruction tuning data mixture.
+Better visual conversation for more scenarios, covering different applications. Better world knowledge and logical reasoning.
+Efficient deployment and inference with SGLang.
+Along with performance improvements, LLaVA-NeXT maintains the minimalist design and data efficiency of LLaVA-1.5. It re-uses the pretrained connector of LLaVA-1.5, and still uses less than 1M visual instruction tuning samples. The largest 34B variant finishes training in ~1 day with 32 A100s.*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_overview.png"
+alt="drawing" width="600"/>
+
+<small> LLaVa-NeXT incorporates a higher input resolution by encoding various patches of the input image. Taken from the <a href="https://huggingface.co/papers/2310.03744">original paper.</a> </small>
+
+This model was contributed by [nielsr](https://huggingface.co/nielsr).
+The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/main).
+
+## Usage tips
+
+- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
+
+<Tip warning={true}>
+
+- Llava-Next uses a different number of patches for each image and thus has to pad the inputs inside the modeling code, in addition to the padding done when processing the inputs. The default setting is "left-padding" if the model is in `eval()` mode, otherwise "right-padding".
+
+</Tip>
+
+
+> [!NOTE]
+> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you.
+Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings.
+The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches.
+
+
+### Formatting Prompts with Chat Templates  
+
+Each **checkpoint** is trained with a specific prompt format, depending on the underlying large language model backbone. To ensure correct formatting, use the processor’s `apply_chat_template` method.  
+
+**Important:**  
+- You must construct a conversation history — passing a plain string won't work.  
+- Each message should be a dictionary with `"role"` and `"content"` keys.  
+- The `"content"` should be a list of dictionaries for different modalities like `"text"` and `"image"`.  
+
+
+Here’s an example of how to structure your input. We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image.

 ```python
-import torch  
-from transformers import pipeline  
+from transformers import LlavaNextProcessor

-pipeline = pipeline(  
-    task="image-text-to-text",  
-    model="llava-hf/llava-v1.6-mistral-7b-hf",  
-    device=0,  
-    torch_dtype=torch.bfloat16  
-)  
-messages = [  
-    {  
-        "role": "user",  
-        "content": [  
-            {  
-                "type": "image",  
-                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",  
-            },  
-            { "type": "text", "text": "Describe this image."},  
-        ]  
-    }  
-]  
-pipeline(text=messages, max_new_tokens=20, return_full_text=False)
+processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": "What’s shown in this image?"},
+        ],
+    },
+    {
+        "role": "assistant",
+        "content": [{"type": "text", "text": "This image shows a red stop sign."},]
+    },
+    {
+
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Describe the image in more details."},
+        ],
+    },
+]
+
+text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+
+# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
+print(text_prompt)
+>>> "[INST] <image>\nWhat's shown in this image? [/INST] This image shows a red stop sign. [INST] Describe the image in more details. [/INST]"
 ```

-</hfoption>
-
-<hfoption id="AutoModel">
-
-```python
-import torch  
-import requests  
-from PIL import Image  
-from transformers import AutoProcessor, LlavaNextForConditionalGeneration  
-
-processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")  
-model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16).to("cuda")  
-
-url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"  
-image = Image.open(requests.get(url, stream=True).raw)  
-
-conversation = [  
-    {  
-        "role": "user",  
-        "content": [  
-            {"type": "image"},  
-            {"type": "text", "text": "What is shown in this image?"},  
-        ],  
-    },  
-]  
-prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)  
-inputs = processor(image, prompt, return_tensors="pt").to("cuda")  
-output = model.generate(**inputs, max_new_tokens=100)  
-print(processor.decode(output[0], skip_special_tokens=True))  
+- If you want to construct a chat prompt yourself, below is a list of possible formats.
+
+[llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) requires the following format:
+```bash
+"[INST] <image>\nWhat is shown in this image? [/INST]"
 ```

-</hfoption>
-
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
-
-```python
-import torch  
-import requests  
-from PIL import Image  
-from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig  
-
-quant_config = BitsAndBytesConfig(  
-    load_in_4bit=True,  
-    bnb_4bit_compute_dtype=torch.float16,  
-    bnb_4bit_quant_type="nf4"  
-)  
-
-processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")  
-model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quant_config, device_map="auto")  
-
-url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_ocr.png"  
-image = Image.open(requests.get(url, stream=True).raw)  
-
-conversation = [  
-    {  
-        "role": "user",  
-        "content": [  
-            {"type": "image"},  
-            {"type": "text", "text": "What does this chart show?"},  
-        ],  
-    },  
-]  
-prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)  
-inputs = processor(image, prompt, return_tensors="pt").to("cuda")  
-
-with torch.inference_mode():  
-    output = model.generate(**inputs, max_new_tokens=100)  
-print(processor.decode(output[0], skip_special_tokens=True))  
+[llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) and [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) require the following format:
+```bash
+"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
 ```

-
-## Notes
-
-* Different checkpoints (Mistral, Vicuna, etc.) require a specific prompt format depending on the underlying LLM. Always use [`~ProcessorMixin.apply_chat_template`] to ensure correct formatting. Refer to the [Templates](../chat_templating) guide for more details.
-
-* Set `padding_side="left"` during batched generation for more accurate results.
-
-```py
-processor.tokenizer.padding_side = "left"
+[llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) requires the following format:
+```bash
+"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
 ```

-* LLaVA-NeXT uses different numbers of patches for images and pads the inputs inside the modeling code except when padding is done during processing. The default setting is *left-padding* if the model is in `eval()` mode, otherwise it is *right-padding*.
+[llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf) requires the following format:

-* LLaVA models after v4.46 raises warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}`, and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add these attributes to the processor if you own the model checkpoint or open a PR if it isn't.
+```bash
+"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n<image>\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+```

-  Adding these attributes means LLaVA will try to infer the number of image tokens required per image and expand the text with the same number of `<image>` token placeholders. There are usually ~500 tokens per image, so make sure the text is not truncated because it will cause a failure when merging the embeddings. The attributes can be found in `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`.
+[llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf) and [llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf) require the following format:

-  The `num_additional_image_tokens` should be `1` if the vision backbone adds a `CLS` token or `0` if nothing extra is added.
+```bash
+"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n"
+```

-* The example below demonstrates inference with multiple input images.
+🚀 **Bonus:** If you're using `transformers>=4.49.0`, you can also get a vectorized output from `apply_chat_template`. See the **Usage example** section below for more details on how to use it.
+
+
+
+## Usage example
+
+### Single image inference
+
+Here's how to load the model and perform inference in half-precision (`torch.float16`):

 ```python
 from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
+import torch
 from PIL import Image
-import requests, torch
+import requests

 processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-model = LlavaNextForConditionalGeneration.from_pretrained(
-    "llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16
-).to("cuda")

-# Load multiple images
-url1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_ocr.png"
-url2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_comparison.png"
+model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16)
+model.to("cuda:0")

-image1 = Image.open(requests.get(url1, stream=True).raw)
-image2 = Image.open(requests.get(url2, stream=True).raw)
+# prepare image and text prompt, using the appropriate prompt template
+url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
+image = Image.open(requests.get(url, stream=True).raw)

 conversation = [
-    {"role": "user", "content": [{"type": "image"}, {"type": "image"}, {"type": "text", "text": "Compare these two images and describe the differences."}]}
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": "What is shown in this image?"},
+        ],
+    },
 ]
 prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-inputs = processor([image1, image2], prompt, return_tensors="pt").to("cuda")
+inputs = processor(image, prompt, return_tensors="pt").to("cuda:0")

+# autoregressively complete prompt
 output = model.generate(**inputs, max_new_tokens=100)
+
 print(processor.decode(output[0], skip_special_tokens=True))
 ```

+### Multi image inference
+
+LLaVa-Next can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). Here is how you can do it:
+
+```python
+import requests
+from PIL import Image
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText
+
+# Load the model in half-precision
+model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
+processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+
+# Get three different images
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image_stop = Image.open(requests.get(url, stream=True).raw)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image_cats = Image.open(requests.get(url, stream=True).raw)
+
+url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
+image_snowman = Image.open(requests.get(url, stream=True).raw)
+
+# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not
+conversation_1 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": "What is shown in this image?"},
+            ],
+    },
+    {
+        "role": "assistant",
+        "content": [
+            {"type": "text", "text": "There is a red stop sign in the image."},
+            ],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": "What about this image? How many cats do you see?"},
+            ],
+    },
+]
+
+conversation_2 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": "What is shown in this image?"},
+            ],
+    },
+]
+
+prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True)
+prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)
+prompts = [prompt_1, prompt_2]
+
+# We can simply feed images in the order they have to be used in the text prompt
+# Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
+inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(model.device)
+
+# Generate
+generate_ids = model.generate(**inputs, max_new_tokens=30)
+processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+```
+
+## Model optimization
+
+### Quantization using Bitsandbytes
+
+The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`, and to have access to a GPU/accelerator that is supported by the library.
+
+<Tip>
+
+bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).
+
+We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.
+
+</Tip>
+
+Simply replace the model loading code in the snippet above with:
+
+```python
+from transformers import AutoModelForImageTextToText, BitsAndBytesConfig
+
+# specify how to quantize the model
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+)
+
+model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
+```
+
+### Use Flash-Attention 2 to further speed-up generation
+
+First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) for installation instructions. Then replace the model loading code in the snippet above with:
+
+```python
+from transformers import AutoModelForImageTextToText
+
+model = AutoModelForImageTextToText.from_pretrained(
+    model_id,
+    torch_dtype=torch.float16,
+    use_flash_attention_2=True
+).to(0)
+```

 ## LlavaNextConfig

--- a/docs/source/en/model_doc/mamba.md
+++ b/docs/source/en/model_doc/mamba.md
@ -28,7 +28,6 @@ You can find all the original Mamba checkpoints under the [State Space Models](h


 > [!TIP]
-> This model was contributed by [Molbap](https://huggingface.co/Molbap) and [AntonV](https://huggingface.co/AntonV).
 > Click on the Mamba models in the right sidebar for more examples of how to apply Mamba to different language tasks.

 The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
@ -116,13 +115,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
  trainer.train()
   ```

-## MambaCache
-
-[[autodoc]] MambaCache
-    - update_conv_state
-    - update_ssm_state
-    - reset
-
 ## MambaConfig

 [[autodoc]] MambaConfig
--- a/docs/source/en/model_doc/mamba2.md
+++ b/docs/source/en/model_doc/mamba2.md
@ -26,7 +26,6 @@ rendered properly in your Markdown viewer.
 You can find all the original Mamba 2 checkpoints under the [State Space Models](https://huggingface.co/state-spaces) organization, but the examples shown below use [mistralai/Mamba-Codestral-7B-v0.1](https://huggingface.co/mistralai/Mamba-Codestral-7B-v0.1) because a Hugging Face implementation isn't supported yet for the original checkpoints.

 > [!TIP]
-> This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ).
 > Click on the Mamba models in the right sidebar for more examples of how to apply Mamba to different language tasks.

 The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
--- a/docs/source/en/model_doc/marian.md
+++ b/docs/source/en/model_doc/marian.md
@ -14,139 +14,160 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZC
D9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
 # MarianMT

-
-
-[MarianMT](https://huggingface.co/papers/1804.00344) is a machine translation model trained with the Marian framework which is written in pure C++. The framework includes its own custom auto-differentiation engine and efficient meta-algorithms to train encoder-decoder models like BART.
-
-All MarianMT models are transformer encoder-decoders with 6 layers in each component, use static sinusoidal positional embeddings, don't have a layernorm embedding, and the model starts generating with the prefix `pad_token_id` instead of `<s/>`.
-
-
-
-You can find all the original MarianMT checkpoints under the [Language Technology Research Group at the University of Helsinki](https://huggingface.co/Helsinki-NLP/models?search=opus-mt) organization.
-
-
-> [!TIP]
-> This model was contributed by [sshleifer](https://huggingface.co/sshleifer).
->
-> Click on the MarianMT models in the right sidebar for more examples of how to apply MarianMT to translation tasks.
-
-
-The example below demonstrates how to translate text using [`Pipeline`] or the [`AutoModel`] class.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
-
-```python
-
-import torch
-from transformers import pipeline
-
-pipeline = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de", torch_dtype=torch.float16, device=0)
-pipeline("Hello, how are you?")
-
-```
-
-</hfoption>
-
-<hfoption id="AutoModel">
-
-```python
-
-import torch
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
-model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de", torch_dtype=torch.float16, attn_implementation="sdpa", device_map="auto")
-
-inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
-outputs = model.generate(**inputs, cache_implementation="static")
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-
-```
-
-</hfoption>
-</hfoptions>
-
-
-Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139) to better understand what tokens the model can and cannot attend to.
-
-```python
-from transformers.utils.attention_visualizer import AttentionMaskVisualizer
-
-visualizer = AttentionMaskVisualizer("Helsinki-NLP/opus-mt-en-de")
-visualizer("Hello, how are you?")
-```
-<div class="flex justify-center">
-   <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/marianmt-attn-mask.png"/>
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>

-## Notes
+## Overview

- MarianMT models are ~298MB on disk and there are more than 1000 models. Check this [list](https://huggingface.co/Helsinki-NLP) for supported language pairs. The language codes may be inconsistent. Two digit codes can be found [here](https://developers.google.com/admin-sdk/directory/v1/languages) while three digit codes may require further searching.
- Models that require BPE preprocessing are not supported.
- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`. Language codes formatted like `es_AR` usually refer to the `code_{region}`. For example, `es_AR` refers to Spanish from Argentina.
- If a model can output multiple languages, prepend the desired output language to `src_txt` as shown below. New multilingual models from the [Tatoeba-Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge) require 3 character language codes.
+A framework for translation models, using the same models as BART. Translations should be similar, but not identical to output in the test set linked to in each model card.
+This model was contributed by [sshleifer](https://huggingface.co/sshleifer).
+
+
+## Implementation Notes
+
+- Each model is about 298 MB on disk, and there are more than 1,000 models.
+- The list of supported language pairs can be found [here](https://huggingface.co/Helsinki-NLP).
+- Models were originally trained by [Jörg Tiedemann](https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann) using the [Marian](https://marian-nmt.github.io/) C++ library, which supports fast training and translation.
+- All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented
+  in a model card.
+- The 80 opus models that require BPE preprocessing are not supported.
+- The modeling code is the same as [`BartForConditionalGeneration`] with a few minor modifications:
+
+  - static (sinusoid) positional embeddings (`MarianConfig.static_position_embeddings=True`)
+  - no layernorm_embedding (`MarianConfig.normalize_embedding=False`)
+  - the model starts generating with `pad_token_id` (which has 0 as a token_embedding) as the prefix (Bart uses
+    `<s/>`),
+- Code to bulk convert models can be found in `convert_marian_to_pytorch.py`.
+
+
+## Naming
+
+- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`
+- The language codes used to name models are inconsistent. Two digit codes can usually be found [here](https://developers.google.com/admin-sdk/directory/v1/languages), three digit codes require googling "language
+  code {code}".
+- Codes formatted like `es_AR` are usually `code_{region}`. That one is Spanish from Argentina.
+- The models were converted in two stages. The first 1000 models use ISO-639-2 codes to identify languages, while the second
+  group uses a combination of ISO-639-5 codes and ISO-639-2 codes.
+
+
+## Examples
+
+- Since Marian models are smaller than many other translation models available in the library, they can be useful for
+  fine-tuning experiments and integration tests.
+- [Fine-tune on GPU](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/train_distil_marian_enro.sh)
+
+## Multilingual Models
+
+- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`:
+- If a model can output multiple languages, you should specify a language code by prepending the desired output
+  language to the `src_text`.
+- You can see a model's supported language codes in its model card, under target constituents, like in [opus-mt-en-roa](https://huggingface.co/Helsinki-NLP/opus-mt-en-roa).
+- Note that if a model is only multilingual on the source side, like `Helsinki-NLP/opus-mt-roa-en`, no language
+  codes are required.
+
+New multi-lingual models from the [Tatoeba-Challenge repo](https://github.com/Helsinki-NLP/Tatoeba-Challenge)
+require 3 character language codes:

 ```python
+>>> from transformers import MarianMTModel, MarianTokenizer

-from transformers import MarianMTModel, MarianTokenizer
+>>> src_text = [
+...     ">>fra<< this is a sentence in english that we want to translate to french",
+...     ">>por<< This should go to portuguese",
+...     ">>esp<< And this to Spanish",
+... ]

-# Model trained on multiple source languages → multiple target languages
-# Example: multilingual to Arabic (arb)
-model_name = "Helsinki-NLP/opus-mt-mul-mul"  # Tatoeba Challenge model
-tokenizer = MarianTokenizer.from_pretrained(model_name)
-model = MarianMTModel.from_pretrained(model_name)
-
-# Prepend the desired output language code (3-letter ISO 639-3)
-src_texts = ["arb>> Hello, how are you today?"]
-
-# Tokenize and translate
-inputs = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
-translated = model.generate(**inputs)
-
-# Decode and print result
-translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
-print(translated_texts[0])
+>>> model_name = "Helsinki-NLP/opus-mt-en-roa"
+>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+>>> print(tokenizer.supported_language_codes)
+['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']

+>>> model = MarianMTModel.from_pretrained(model_name)
+>>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
+>>> [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+["c'est une phrase en anglais que nous voulons traduire en français",
+ 'Isto deve ir para o português.',
+ 'Y esto al español']
 ```
-   
- Older multilingual models use 2 character language codes.
+
+Here is the code to see all available pretrained models on the hub:

 ```python
+from huggingface_hub import list_models

-from transformers import MarianMTModel, MarianTokenizer
-
-# Example: older multilingual model (like en → many)
-model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"  # English → French, Spanish, Italian, etc.
-tokenizer = MarianTokenizer.from_pretrained(model_name)
-model = MarianMTModel.from_pretrained(model_name)
-
-# Prepend the 2-letter ISO 639-1 target language code (older format)
-src_texts = [">>fr<< Hello, how are you today?"]
-
-# Tokenize and translate
-inputs = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
-translated = model.generate(**inputs)
-
-# Decode and print result
-translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
-print(translated_texts[0])
-
+model_list = list_models()
+org = "Helsinki-NLP"
+model_ids = [x.id for x in model_list if x.id.startswith(org)]
+suffix = [x.split("/")[1] for x in model_ids]
+old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]
 ```

+## Old Style Multi-Lingual Models
+
+These are the old style multi-lingual models ported from the OPUS-MT-Train repo, along with the members of each language
+group:
+
+```python no-style
+['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
+ 'Helsinki-NLP/opus-mt-ROMANCE-en',
+ 'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
+ 'Helsinki-NLP/opus-mt-de-ZH',
+ 'Helsinki-NLP/opus-mt-en-CELTIC',
+ 'Helsinki-NLP/opus-mt-en-ROMANCE',
+ 'Helsinki-NLP/opus-mt-es-NORWAY',
+ 'Helsinki-NLP/opus-mt-fi-NORWAY',
+ 'Helsinki-NLP/opus-mt-fi-ZH',
+ 'Helsinki-NLP/opus-mt-fi_nb_no_nn_ru_sv_en-SAMI',
+ 'Helsinki-NLP/opus-mt-sv-NORWAY',
+ 'Helsinki-NLP/opus-mt-sv-ZH']
+GROUP_MEMBERS = {
+ 'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
+ 'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
+ 'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+ 'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+ 'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
+ 'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
+ 'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
+}
+```
+
+Example of translating English to many Romance languages, using old-style 2 character language codes:
+
+
+```python
+>>> from transformers import MarianMTModel, MarianTokenizer
+
+>>> src_text = [
+...     ">>fr<< this is a sentence in english that we want to translate to french",
+...     ">>pt<< This should go to portuguese",
+...     ">>es<< And this to Spanish",
+... ]
+
+>>> model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
+>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+
+>>> model = MarianMTModel.from_pretrained(model_name)
+>>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
+>>> tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+["c'est une phrase en anglais que nous voulons traduire en français",
+ 'Isto deve ir para o português.',
+ 'Y esto al español']
+```
+
+## Resources
+
+- [Translation task guide](../tasks/translation)
+- [Summarization task guide](../tasks/summarization)
+- [Causal language modeling task guide](../tasks/language_modeling)
+
 ## MarianConfig

 [[autodoc]] MarianConfig
--- a/docs/source/en/model_doc/mask2former.md
+++ b/docs/source/en/model_doc/mask2former.md
@ -77,12 +77,4 @@ The resource should ideally demonstrate something new instead of duplicating an
    - encode_inputs
    - post_process_semantic_segmentation
    - post_process_instance_segmentation
-    - post_process_panoptic_segmentation
-
-## Mask2FormerImageProcessorFast
-
-[[autodoc]] Mask2FormerImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation
-    - post_process_instance_segmentation
    - post_process_panoptic_segmentation
--- a/docs/source/en/model_doc/maskformer.md
+++ b/docs/source/en/model_doc/maskformer.md
@ -76,14 +76,6 @@ This model was contributed by [francesco](https://huggingface.co/francesco). The
    - post_process_instance_segmentation
    - post_process_panoptic_segmentation

-## MaskFormerImageProcessorFast
-
-[[autodoc]] MaskFormerImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation
-    - post_process_instance_segmentation
-    - post_process_panoptic_segmentation
-
 ## MaskFormerFeatureExtractor

 [[autodoc]] MaskFormerFeatureExtractor
--- a/docs/source/en/model_doc/mgp-str.md
+++ b/docs/source/en/model_doc/mgp-str.md
@ -33,7 +33,7 @@ alt="drawing" width="600"/>

 <small> MGP-STR architecture. Taken from the <a href="https://huggingface.co/papers/2209.03592">original paper</a>. </small>

-MGP-STR is trained on two synthetic datasets [MJSynth](http://www.robots.ox.ac.uk/~vgg/data/text/) (MJ) and [SynthText](http://www.robots.ox.ac.uk/~vgg/data/scenetext/) (ST) without fine-tuning on other datasets. It achieves state-of-the-art results on six standard Latin scene text benchmarks, including 3 regular text datasets (IC13, SVT, IIIT) and 3 irregular ones (IC15, SVTP, CUTE).
+MGP-STR is trained on two synthetic datasets [MJSynth](http://www.robots.ox.ac.uk/~vgg/data/text/) (MJ) and [SynthText](http://www.robots.ox.ac.uk/~vgg/data/scenetext/) (ST) without fine-tuning on other datasets. It achieves state-of-the-art results on six standard Latin scene text benchmarks, including 3 regular text datasets (IC13, SVT, IIIT) and 3 irregular ones (IC15, SVTP, CUTE).
 This model was contributed by [yuekun](https://huggingface.co/yuekun). The original code can be found [here](https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/OCR/MGP-STR).

 ## Inference example
--- a/docs/source/en/model_doc/mimi.md
+++ b/docs/source/en/model_doc/mimi.md
@ -30,7 +30,7 @@ The abstract from the paper is the following:

 *We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning— such as emotion or non-speech sounds— is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this “Inner Monologue” method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at github.com/kyutai-labs/moshi.* 

-Its architecture is based on [Encodec](./encodec) with several major differences:
+Its architecture is based on [Encodec](./encodec) with several major differences:
 * it uses a much lower frame-rate.
 * it uses additional transformers for encoding and decoding for better latent contextualization
 * it uses a different quantization scheme: one codebook is dedicated to semantic projection.
--- a/docs/source/en/model_doc/minimax.md
+++ b/docs/source/en/model_doc/minimax.md
@ -115,9 +115,9 @@ The Flash Attention-2 model uses also a more memory efficient cache slicing mech

 ## Shrinking down MiniMax using quantization

-As the MiniMax model has 456 billion parameters, that would require about 912GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization). If the model is quantized to 4 bits (or half a byte per parameter), about 228 GB of RAM is required.
+As the MiniMax model has 456 billion parameters, that would require about 912GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization). If the model is quantized to 4 bits (or half a byte per parameter), about 228 GB of RAM is required.

-Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization) for alternative quantization methods):
+Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization) for alternative quantization methods):

 ```python
 >>> import torch
--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@ -139,10 +139,6 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl

 [[autodoc]] MistralConfig

-## MistralCommonTokenizer
-
-[[autodoc]] MistralCommonTokenizer
-
 ## MistralModel

 [[autodoc]] MistralModel
--- a/docs/source/en/model_doc/mistral3.md
+++ b/docs/source/en/model_doc/mistral3.md
@ -13,125 +13,116 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&amp;logo=pytorch&amp;logoColor=white">
-    </div>
-</div>

-# Mistral 3
+# Mistral3

-[Mistral 3](https://mistral.ai/news/mistral-small-3) is a latency optimized model with a lot fewer layers to reduce the time per forward pass. This model adds vision understanding and supports long context lengths of up to 128K tokens without compromising performance.
+## Overview

-You can find the original Mistral 3 checkpoints under the [Mistral AI](https://huggingface.co/mistralai/models?search=mistral-small-3) organization.
+Building upon Mistral Small 3 (2501), Mistral Small 3.1 (2503) adds state-of-the-art vision understanding and enhances long context capabilities up to 128k tokens without compromising text performance. With 24 billion parameters, this model achieves top-tier capabilities in both text and vision tasks.

+It is ideal for:
+- Fast-response conversational agents.
+- Low-latency function calling.
+- Subject matter experts via fine-tuning.
+- Local inference for hobbyists and organizations handling sensitive data.
+- Programming and math reasoning.
+- Long document understanding.
+- Visual understanding.

-> [!TIP]
-> This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez) and [yonigozlan](https://huggingface.co/yonigozlan).
-> Click on the Mistral3 models in the right sidebar for more examples of how to apply Mistral3 to different tasks.
+This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez) and [yonigozlan](https://huggingface.co/yonigozlan).

-The example below demonstrates how to generate text for an image with [`Pipeline`] and the [`AutoModel`] class.
+The original code can be found [here](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/pixtral.py) and [here](https://github.com/mistralai/mistral-common).

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+## Usage example

-```py
-import torch
-from transformers import pipeline
+### Inference with Pipeline

-messages = [
-    {"role": "user",
-        "content":[
-            {"type": "image",
-            "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",},
-            {"type": "text", "text": "Describe this image."}
-        ,]
-    ,}
-,]
+Here is how you can use the `image-text-to-text` pipeline to perform inference with the `Mistral3` models in just a few lines of code:
+```python
+>>> from transformers import pipeline

-pipeline = pipeline(
-    task="image-text-to-text", 
-    model="mistralai/Mistral-Small-3.1-24B-Instruct-2503", 
-    torch_dtype=torch.bfloat16,
-    device=0
-)
-outputs = pipeline(text=messages, max_new_tokens=50, return_full_text=False)
+>>> messages = [
+...     {
+...         "role": "user",
+...         "content": [
+...             {
+...                 "type": "image",
+...                 "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
+...             },
+...             {"type": "text", "text": "Describe this image."},
+...         ],
+...     },
+... ]

-outputs[0]["generated_text"]
+>>> pipe = pipeline("image-text-to-text", model="mistralai/Mistral-Small-3.1-24B-Instruct-2503", torch_dtype=torch.bfloat16)
+>>> outputs = pipe(text=messages, max_new_tokens=50, return_full_text=False)
+>>> outputs[0]["generated_text"]
 'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a'
 ```
-</hfoption>
-<hfoption id="AutoModel">
+### Inference on a single image

-```py
-import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText 
+This example demonstrates how to perform inference on a single image with the Mistral3 models using chat templates.

-torch_device = "cuda"
-model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-model = AutoModelForImageTextToText.from_pretrained(
-    model_checkpoint, 
-    device_map=torch_device, 
-    torch_dtype=torch.bfloat16
-)
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

-messages = [
-    {"role": "user",
-        "content":[
-            {"type": "image",
-            "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",},
-            {"type": "text", "text": "Describe this image."}
-        ,]
-    ,}
-,]
+>>> torch_device = "cuda"
+>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

-inputs = processor.apply_chat_template(
-    messages, 
-    add_generation_prompt=True, 
-    tokenize=True, return_dict=True, 
-    return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+>>> messages = [
+...     {
+...         "role": "user",
+...         "content": [
+...             {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+...             {"type": "text", "text": "Describe this image"},
+...         ],
+...     }
+... ]

-generate_ids = model.generate(**inputs, max_new_tokens=20)
-decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

-decoded_output
-'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a'
+>>> generate_ids = model.generate(**inputs, max_new_tokens=20)
+>>> decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+
+>>> decoded_output
+"The image depicts two cats lying on a pink blanket. The larger cat, which appears to be an"...
 ```
-</hfoption>
-</hfoptions>

-## Notes 
+### Text-only generation
+This example shows how to generate text using the Mistral3 model without providing any image input.

- Mistral 3 supports text-only generation. 
-```py 
-from transformers import AutoProcessor, AutoModelForImageTextToText
-import torch

-torch_device = "cuda"
-model_checkpoint = ".mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+````python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

-SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, always end your accurate response with an ASCII drawing of a cat."
-user_prompt = "Give me 5 non-formal ways to say 'See you later' in French."
+>>> torch_device = "cuda"
+>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

-messages = [
-    {"role": "system", "content": SYSTEM_PROMPT},
-    {"role": "user", "content": user_prompt},
-]
+>>> SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, always end your accurate response with an ASCII drawing of a cat."
+>>> user_prompt = "Give me 5 non-formal ways to say 'See you later' in French."

-text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-inputs = processor(text=text, return_tensors="pt").to(0, dtype=torch.float16)
-generate_ids = model.generate(**inputs, max_new_tokens=50, do_sample=False)
-decoded_output = processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]
+>>> messages = [
+...    {"role": "system", "content": SYSTEM_PROMPT},
+...    {"role": "user", "content": user_prompt},
+... ]

-print(decoded_output)
+>>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+>>> inputs = processor(text=text, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+>>> generate_ids = model.generate(**inputs, max_new_tokens=50, do_sample=False)
+>>> decoded_output = processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]
+
+>>> print(decoded_output)
 "1. À plus tard!
- 2. Salut, à plus!
- 3. À toute!
- 4. À la prochaine!
- 5. Je me casse, à plus!
+2. Salut, à plus!
+3. À toute!
+4. À la prochaine!
+5. Je me casse, à plus!

 ```
 /\_/\
@ -140,101 +131,102 @@ print(decoded_output)
 ```"
 ````

- Mistral 3 accepts batched image and text inputs. 
-```py
-from transformers import AutoProcessor, AutoModelForImageTextToText
-import torch
+### Batched image and text inputs
+Mistral3 models also support batched image and text inputs.

-torch_device = "cuda"
-model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

-messages = [
-     [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
-                 {"type": "text", "text": "Write a haiku for this image"},
-             ],
-         },
-     ],
-     [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
-                 {"type": "text", "text": "Describe this image"},
-             ],
-         },
-     ],
- ]
+>>> torch_device = "cuda"
+>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+
+>>> messages = [
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+...                 {"type": "text", "text": "Write a haiku for this image"},
+...             ],
+...         },
+...     ],
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+...                 {"type": "text", "text": "Describe this image"},
+...             ],
+...         },
+...     ],
+... ]


- inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+>>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

- output = model.generate(**inputs, max_new_tokens=25)
+>>> output = model.generate(**inputs, max_new_tokens=25)

- decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
- decoded_outputs
+>>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
+>>> decoded_outputs
 ["Write a haiku for this imageCalm waters reflect\nWhispers of the forest's breath\nPeace on wooden path"
 , "Describe this imageThe image depicts a vibrant street scene in what appears to be a Chinatown district. The focal point is a traditional Chinese"]
 ```

- Mistral 3 also supported batched image and text inputs with a different number of images for each text. The example below quantizes the model with bitsandbytes. 
+### Batched multi-image input and quantization with BitsAndBytes
+This implementation of the Mistral3 models supports batched text-image inputs with a different number of images for each text.
+This example also shows how to use `BitsAndBytes` to load the model in 4-bit quantization.

-```py 
-from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
-import torch
+```python
+>>> from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
+>>> import torch

-torch_device = "cuda"
-model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-model = AutoModelForImageTextToText.from_pretrained(
-     model_checkpoint, quantization_config=quantization_config
- )
+>>> torch_device = "cuda"
+>>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+>>> quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+>>> model = AutoModelForImageTextToText.from_pretrained(
+...     model_checkpoint, quantization_config=quantization_config
+... )

-messages = [
-     [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
-                 {"type": "text", "text": "Write a haiku for this image"},
-             ],
-         },
-     ],
-     [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
-                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
-                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
-             ],
-         },
-     ],
- ]
+>>> messages = [
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+...                 {"type": "text", "text": "Write a haiku for this image"},
+...             ],
+...         },
+...     ],
+...     [
+...         {
+...             "role": "user",
+...             "content": [
+...                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
+...                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
+...                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
+...             ],
+...         },
+...     ],
+... ]

- inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+>>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

- output = model.generate(**inputs, max_new_tokens=25)
+>>> output = model.generate(**inputs, max_new_tokens=25)

- decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
- decoded_outputs
+>>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
+>>> decoded_outputs
 ["Write a haiku for this imageSure, here is a haiku inspired by the image:\n\nCalm lake's wooden path\nSilent forest stands guard\n", "These images depict two different landmarks. Can you identify them? Certainly! The images depict two iconic landmarks:\n\n1. The first image shows the Statue of Liberty in New York City."]
 ```

+
 ## Mistral3Config

 [[autodoc]] Mistral3Config

-## MistralCommonTokenizer
-
-[[autodoc]] MistralCommonTokenizer
-
 ## Mistral3Model

 [[autodoc]] Mistral3Model
--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@ -146,9 +146,9 @@ The Flash Attention-2 model uses also a more memory efficient cache slicing mech

 ## Shrinking down Mixtral using quantization

-As the Mixtral model has 45 billion parameters, that would require about 90GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization). If the model is quantized to 4 bits (or half a byte per parameter), a single A100 with 40GB of RAM is enough to fit the entire model, as in that case only about 27 GB of RAM is required.
+As the Mixtral model has 45 billion parameters, that would require about 90GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), a single A100 with 40GB of RAM is enough to fit the entire model, as in that case only about 27 GB of RAM is required.

-Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization) for alternative quantization methods):
+Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization.md) for alternative quantization methods):

 ```python
 >>> import torch
@ -197,10 +197,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h

 [[autodoc]] MixtralConfig

-## MistralCommonTokenizer
-
-[[autodoc]] MistralCommonTokenizer
-
 ## MixtralModel

 [[autodoc]] MixtralModel
--- a/docs/source/en/model_doc/mm-grounding-dino.md
+++ b/docs/source/en/model_doc/mm-grounding-dino.md
@ -1,124 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
-# MM Grounding DINO
-
-[MM Grounding DINO](https://arxiv.org/abs/2401.02361) model was proposed in [An Open and Comprehensive Pipeline for Unified Object Grounding and Detection](https://arxiv.org/abs/2401.02361) by Xiangyu Zhao, Yicheng Chen, Shilin Xu, Xiangtai Li, Xinjiang Wang, Yining Li, Haian Huang>.
-
-MM Grounding DINO improves upon the [Grounding DINO](https://huggingface.co/docs/transformers/model_doc/grounding-dino) by improving the contrastive class head and removing the parameter sharing in the decoder, improving zero-shot detection performance on both COCO (50.6(+2.2) AP) and LVIS (31.9(+11.8) val AP and 41.4(+12.6) minival AP).
-
-You can find all the original MM Grounding DINO checkpoints under the [MM Grounding DINO](https://huggingface.co/collections/openmmlab-community/mm-grounding-dino-688cbde05b814c4e2832f9df) collection. This model also supports LLMDet inference. You can find LLMDet checkpoints under the [LLMDet](https://huggingface.co/collections/iSEE-Laboratory/llmdet-688475906dc235d5f1dc678e) collection.
-
-> [!TIP]
-> Click on the MM Grounding DINO models in the right sidebar for more examples of how to apply MM Grounding DINO to different MM Grounding DINO tasks.
-
-The example below demonstrates how to generate text based on an image with the [`AutoModelForZeroShotObjectDetection`] class.
-
-<hfoptions id="usage">
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
-from transformers.image_utils import load_image
-
-
-# Prepare processor and model
-model_id = "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
-
-# Prepare inputs
-image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = load_image(image_url)
-text_labels = [["a cat", "a remote control"]]
-inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)
-
-# Run inference
-with torch.no_grad():
-    outputs = model(**inputs)
-
-# Postprocess outputs
-results = processor.post_process_grounded_object_detection(
-    outputs,
-    threshold=0.4,
-    target_sizes=[(image.height, image.width)]
-)
-
-# Retrieve the first image result
-result = results[0]
-for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
-    box = [round(x, 2) for x in box.tolist()]
-    print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")
-```
-
-</hfoption>
-</hfoptions>
-
-## Notes
-
- Here's a table of models and their object detection performance results on COCO (results from [official repo](https://github.com/open-mmlab/mmdetection/blob/main/configs/mm_grounding_dino/README.md)):
-
-    |                                                              Model                                                             | Backbone |      Pre-Train Data      |   Style   |  COCO mAP  |
-    | ------------------------------------------------------------------------------------------------------------------------------ | -------- | ------------------------ | --------- | ---------- |
-    |  [mm_grounding_dino_tiny_o365v1_goldg](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg)                       |  Swin-T  |        O365,GoldG        | Zero-shot | 50.4(+2.3) |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_grit](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit)             |  Swin-T  |     O365,GoldG,GRIT      | Zero-shot | 50.5(+2.1) |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det)           |  Swin-T  |     O365,GoldG,V3Det     | Zero-shot | 50.6(+2.2) |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_grit_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit_v3det) |  Swin-T  |  O365,GoldG,GRIT,V3Det   | Zero-shot | 50.4(+2.0) |
-    |  [mm_grounding_dino_base_o365v1_goldg_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_base_o365v1_goldg_v3det)           |  Swin-B  |     O365,GoldG,V3Det     | Zero-shot |    52.5    |
-    |  [mm_grounding_dino_base_all](https://huggingface.co/openmmlab-community/mm_grounding_dino_base_all)                                         |  Swin-B  |         O365,ALL         |     -     |    59.5    |
-    |  [mm_grounding_dino_large_o365v2_oiv6_goldg](https://huggingface.co/openmmlab-community/mm_grounding_dino_large_o365v2_oiv6_goldg)           |  Swin-L  | O365V2,OpenImageV6,GoldG | Zero-shot |    53.0    |
-    |  [mm_grounding_dino_large_all](https://huggingface.co/openmmlab-community/mm_grounding_dino_large_all)                                       |  Swin-L  |  O365V2,OpenImageV6,ALL  |     -     |    60.3    |
-
- Here's a table of MM Grounding DINO tiny models and their object detection performance on LVIS (results from [official repo](https://github.com/open-mmlab/mmdetection/blob/main/configs/mm_grounding_dino/README.md)):
-
-    |                                                              Model                                                             |    Pre-Train Data     | MiniVal APr | MiniVal APc | MiniVal APf | MiniVal AP  | Val1.0 APr | Val1.0 APc | Val1.0 APf |  Val1.0 AP  |
-    | ------------------------------------------------------------------------------------------------------------------------------ | --------------------- | ----------- | ----------- | ----------- | ----------- | ---------- | ---------- | ---------- | ----------- |
-    |  [mm_grounding_dino_tiny_o365v1_goldg](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg)                       |      O365,GoldG       |    28.1     |    30.2     |    42.0     | 35.7(+6.9)  |    17.1    |    22.4    |    36.5    | 27.0(+6.9)  |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_grit](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit)             |    O365,GoldG,GRIT    |    26.6     |    32.4     |    41.8     | 36.5(+7.7)  |    17.3    |    22.6    |    36.4    | 27.1(+7.0)  |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det)           |   O365,GoldG,V3Det    |    33.0     |    36.0     |    45.9     | 40.5(+11.7) |    21.5    |    25.5    |    40.2    | 30.6(+10.5) |
-    |  [mm_grounding_dino_tiny_o365v1_goldg_grit_v3det](https://huggingface.co/openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_grit_v3det) | O365,GoldG,GRIT,V3Det |    34.2     |    37.4     |    46.2     | 41.4(+12.6) |    23.6    |    27.6    |    40.5    | 31.9(+11.8) |
-
-
- This implementation also supports inference for [LLMDet](https://github.com/iSEE-Laboratory/LLMDet). Here's a table of LLMDet models and their performance on LVIS (results from [official repo](https://github.com/iSEE-Laboratory/LLMDet)):
-
-    |                             Model                         | Pre-Train Data            |  MiniVal APr | MiniVal APc | MiniVal APf | MiniVal AP  | Val1.0 APr | Val1.0 APc | Val1.0 APf |  Val1.0 AP  |
-    | --------------------------------------------------------- | -------------------------------------------- | ------------ | ----------- | ----------- | ----------- | ---------- | ---------- | ---------- | ----------- |
-    | [llmdet_tiny](https://huggingface.co/iSEE-Laboratory/llmdet_tiny)   | (O365,GoldG,GRIT,V3Det) + GroundingCap-1M    | 44.7         | 37.3        | 39.5        | 50.7        | 34.9       | 26.0       | 30.1       | 44.3        |
-    | [llmdet_base](https://huggingface.co/iSEE-Laboratory/llmdet_base)   | (O365,GoldG,V3Det) + GroundingCap-1M         | 48.3         | 40.8        | 43.1        | 54.3        | 38.5       | 28.2       | 34.3       | 47.8        |
-    | [llmdet_large](https://huggingface.co/iSEE-Laboratory/llmdet_large) | (O365V2,OpenImageV6,GoldG) + GroundingCap-1M | 51.1         | 45.1        | 46.1        | 56.6        | 42.0       | 31.6       | 38.8       | 50.2        |
-
-
-## MMGroundingDinoConfig
-
-[[autodoc]] MMGroundingDinoConfig
-
-## MMGroundingDinoModel
-
-[[autodoc]] MMGroundingDinoModel
-    - forward
-
-## MMGroundingDinoForObjectDetection
-
-[[autodoc]] MMGroundingDinoForObjectDetection
-    - forward
--- a/docs/source/en/model_doc/modernbert-decoder.md
+++ b/docs/source/en/model_doc/modernbert-decoder.md
@ -1,188 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-  <div class="flex flex-wrap space-x-1">
-    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-    <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-    <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-  </div>
-</div>
-
-# ModernBERT Decoder
-
-ModernBERT Decoder has the same architecture as [ModernBERT](https://huggingface.co/papers/2412.13663) but it is trained from scratch with a causal language modeling objective from the [Ettin paper](https://huggingface.co/papers/2507.11412). This allows for using the same architecture to compare encoders and decoders. This model is the decoder architecture implementation of ModernBERT, designed for autoregressive text generation tasks.
-
-ModernBERT Decoder uses sliding window attention and rotary positional embeddings for efficiency and to handle longer sequences.
-
-You can find all the original ModernBERT Decoder checkpoints under the [jhu-clsp](https://huggingface.co/collections/jhu-clsp/encoders-vs-decoders-the-ettin-suite-686303e16142257eed8e6aeb) collection.
-
-> [!TIP]
-> This model was contributed by [orionw](https://huggingface.co/orionweller).
->
-> Click on the ModernBERT Decoder models in the right sidebar for more examples of how to apply ModernBERT Decoder to different text generation tasks.
-
-The example below demonstrates how to use ModernBERT Decoder for text generation with [`Pipeline`], [`AutoModel`] (with and without quantization), and from the command line. 
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
-
-```py
-import torch
-from transformers import pipeline
-
-generator = pipeline(
-    task="text-generation",
-    model="jhu-clsp/ettin-decoder-17m",
-    torch_dtype=torch.float16,
-    device=0
-)
-generator("The future of artificial intelligence is", max_length=50, num_return_sequences=1)
-
-# For sequence classification
-classifier = pipeline(
-    task="text-classification",
-    model="jhu-clsp/ettin-decoder-17m",
-    torch_dtype=torch.float16,
-    device=0
-)
-classifier("This movie is really great!")
-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/ettin-decoder-17m")
-model = AutoModelForCausalLM.from_pretrained(
-    "jhu-clsp/ettin-decoder-17m",
-    torch_dtype=torch.float16,
-    device_map="auto",
-)
-
-prompt = "The future of artificial intelligence is"
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    outputs = model.generate(
-        **inputs,
-        max_length=50,
-        num_return_sequences=1,
-        temperature=0.7,
-        do_sample=True,
-        pad_token_id=tokenizer.eos_token_id
-    )
-
-generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-print(f"Generated text: {generated_text}")
-
-# For sequence classification
-from transformers import AutoModelForSequenceClassification
-
-classifier_model = AutoModelForSequenceClassification.from_pretrained(
-    "jhu-clsp/ettin-decoder-17m",
-    torch_dtype=torch.float16,
-    device_map="auto",
-    num_labels=2
-)
-
-text = "This movie is really great!"
-inputs = tokenizer(text, return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    outputs = classifier_model(**inputs)
-    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
-    predicted_class = torch.argmax(predictions, dim=-1)
-
-print(f"Predicted class: {predicted_class.item()}")
-print(f"Prediction probabilities: {predictions}")
-```
-
-</hfoption>
-
-<hfoption id="AutoModel (w/quantization)">
-
-```
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True,
-)
-
-tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/ettin-decoder-1b")
-model = AutoModelForCausalLM.from_pretrained(
-    "jhu-clsp/ettin-decoder-1b",
-    torch_dtype=torch.float16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-prompt = "The future of artificial intelligence is"
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-
-with torch.no_grad():
-    outputs = model.generate(
-        **inputs,
-        max_length=50,
-        num_return_sequences=1,
-        temperature=0.7,
-        do_sample=True,
-        pad_token_id=tokenizer.eos_token_id
-    )
-
-generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-print(f"Generated text: {generated_text}")
-```
-</hfoption>
-
-<hfoption id="transformers CLI">
-
-```bash
-echo "The future of artificial intelligence is" | transformers run --task text-generation --model jhu-clsp/ettin-decoder-17m --device 0
-```
-
-</hfoption>
-</hfoptions>
-
-
-## ModernBertDecoderConfig
-
-[[autodoc]] ModernBertDecoderConfig
-
-<frameworkcontent>
-<pt>
-
-## ModernBertDecoderModel
-
-[[autodoc]] ModernBertDecoderModel
-    - forward
-
-## ModernBertDecoderForCausalLM
-
-[[autodoc]] ModernBertDecoderForCausalLM
-    - forward
-
-## ModernBertDecoderForSequenceClassification
-
-[[autodoc]] ModernBertDecoderForSequenceClassification
-    - forward
-
-</pt>
-</frameworkcontent>
--- a/docs/source/en/model_doc/mt5.md
+++ b/docs/source/en/model_doc/mt5.md
@ -14,115 +14,54 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZC
D9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC">
-    </div>
-</div>
-
 # mT5

-[mT5](https://huggingface.co/papers/2010.11934) is a multilingual variant of [T5](./t5), training on 101 languages. It also incorporates a new "accidental translation" technique to prevent the model from incorrectly translating predictions into the wrong language.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+">
+</div>

-You can find all the original [mT5] checkpoints under the [mT5](https://huggingface.co/collections/google/mt5-release-65005f1a520f8d7b4d039509) collection.
+## Overview

-> [!TIP]
-> This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
->
-> Click on the mT5 models in the right sidebar for more examples of how to apply mT5 to different language tasks.
+The mT5 model was presented in [mT5: A massively multilingual pre-trained text-to-text transformer](https://huggingface.co/papers/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya
+Siddhant, Aditya Barua, Colin Raffel.

-The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.
+The abstract from the paper is the following:

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+*The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
+state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
+multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail
+the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
+benchmarks. We also describe a simple technique to prevent "accidental translation" in the zero-shot setting, where a
+generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model
+checkpoints used in this work are publicly available.*

-```python
-import torch
-from transformers import pipeline
+Note: mT5 was only pre-trained on [mC4](https://huggingface.co/datasets/mc4) excluding any supervised training.
+Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model.
+Since mT5 was pre-trained without supervision, there's no real advantage to using a task prefix during single-task
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.

-pipeline = pipeline(
-    task="text2text-generation",
-    model="csebuetnlp/mT5_multilingual_XLSum",
-    torch_dtype=torch.float16,
-    device=0
-)
-pipeline("""Plants are remarkable organisms that produce their own food using a method called photosynthesis.
-This process involves converting sunlight, carbon dioxide, and water into glucose, which provides energy for growth.
-Plants play a crucial role in sustaining life on Earth by generating oxygen and serving as the foundation of most ecosystems.""")
-```
+Google has released the following variants:

-</hfoption>
-<hfoption id="AutoModel">
+- [google/mt5-small](https://huggingface.co/google/mt5-small)

-```python
-import torch
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+- [google/mt5-base](https://huggingface.co/google/mt5-base)

-tokenizer = AutoTokenizer.from_pretrained(
-    "csebuetnlp/mT5_multilingual_XLSum"
-)
-model = AutoModelForSeq2SeqLM.from_pretrained(
-    "csebuetnlp/mT5_multilingual_XLSum",
-    torch_dtype=torch.float16,
-    device_map="auto",
-)
+- [google/mt5-large](https://huggingface.co/google/mt5-large)

-input_text = """Plants are remarkable organisms that produce their own food using a method called photosynthesis.
-This process involves converting sunlight, carbon dioxide, and water into glucose, which provides energy for growth.
-Plants play a crucial role in sustaining life on Earth by generating oxygen and serving as the foundation of most ecosystems."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+- [google/mt5-xl](https://huggingface.co/google/mt5-xl)

-output = model.generate(**input_ids, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
+- [google/mt5-xxl](https://huggingface.co/google/mt5-xxl)

-</hfoption>
-<hfoption id="transformers CLI">
+This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten). The original code can be
+found [here](https://github.com/google-research/multilingual-t5).

-```bash
-echo -e "Plants are remarkable organisms that produce their own food using a method called photosynthesis." | transformers run --task text2text-generation --model csebuetnlp/mT5_multilingual_XLSum --device 0
-```
+## Resources

-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
-
-```python
-import torch
-from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_quant_type="nf4"
-)
-model = AutoModelForSeq2SeqLM.from_pretrained(
-    "csebuetnlp/mT5_multilingual_XLSum",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained(
-    "csebuetnlp/mT5_multilingual_XLSum"
-)
-input_text = """Plants are remarkable organisms that produce their own food using a method called photosynthesis.
-This process involves converting sunlight, carbon dioxide, and water into glucose, which provides energy for growth.
-Plants play a crucial role in sustaining life on Earth by generating oxygen and serving as the foundation of most ecosystems."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-output = model.generate(**input_ids, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-## Notes
-
- mT5 must be fine-tuned for downstream tasks because it was only pretrained on the [mc4](https://huggingface.co/datasets/mc4) dataset.
+- [Translation task guide](../tasks/translation)
+- [Summarization task guide](../tasks/summarization)

 ## MT5Config

--- a/docs/source/en/model_doc/olmoe.md
+++ b/docs/source/en/model_doc/olmoe.md
@ -14,89 +14,27 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
+# OLMoE
+
 <div class="flex flex-wrap space-x-1">
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>
-</div>

-# OLMoE
+## Overview

-[OLMoE](https://huggingface.co/papers/2409.02060) is a sparse Mixture-of-Experts (MoE) language model with 7B parameters but only 1B parameters are used per input token. It has similar inference costs as dense models but trains ~3x faster. OLMoE uses fine-grained routing with 64 small experts in each layer and uses a dropless token-based routing algorithm.
+The OLMoE model was proposed in [OLMoE: Open Mixture-of-Experts Language Models](https://huggingface.co/papers/2409.02060) by Niklas Muennighoff, Luca Soldaini, Dirk Groeneveld, Kyle Lo, Jacob Morrison, Sewon Min, Weijia Shi, Pete Walsh, Oyvind Tafjord, Nathan Lambert, Yuling Gu, Shane Arora, Akshita Bhagia, Dustin Schwenk, David Wadden, Alexander Wettig, Binyuan Hui, Tim Dettmers, Douwe Kiela, Ali Farhadi, Noah A. Smith, Pang Wei Koh, Amanpreet Singh, Hannaneh Hajishirzi.

-You can find all the original OLMoE checkpoints under the [OLMoE](https://huggingface.co/collections/allenai/olmoe-november-2024-66cf678c047657a30c8cd3da) collection.
+OLMoE is a series of **O**pen **L**anguage **Mo**dels using sparse **M**ixture-**o**f-**E**xperts designed to enable the science of language models. We release all code, checkpoints, logs, and details involved in training these models.

-> [!TIP]
-> This model was contributed by [Muennighoff](https://hf.co/Muennighoff).
->
-> Click on the OLMoE models in the right sidebar for more examples of how to apply OLMoE to different language tasks.
+The abstract from the paper is the following:

-The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`] class.
+*We introduce OLMoE, a fully open, state-of-the-art language model leveraging sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but uses only 1B per input token. We pretrain it on 5 trillion tokens and further adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available models with similar active parameters, even surpassing larger ones like Llama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE training, analyze routing in our model showing high specialization, and open-source all aspects of our work: model weights, training data, code, and logs.*

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+This model was contributed by [Muennighoff](https://hf.co/Muennighoff).
+The original code can be found [here](https://github.com/allenai/OLMoE).

-```py
-import torch
-from transformers import pipeline
-
-pipe = pipeline(
-    task="text-generation",
-    model="allenai/OLMoE-1B-7B-0125",
-    torch_dtype=torch.float16,
-    device=0,
-)
-
-result = pipe("Dionysus is the god of")
-print(result)
-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-model = AutoModelForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924", attn_implementation="sdpa", torch_dtype="auto", device_map="auto").to(device)
-tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")
-
-inputs = tokenizer("Bitcoin is", return_tensors="pt")
-inputs = {k: v.to(device) for k, v in inputs.items()}
-output = model.generate(**inputs, max_length=64)
-print(tokenizer.decode(output[0]))
-```
-
-## Quantization
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-quantization_config = BitsAndBytesConfig(
-   load_in_4bit=True,
-   bnb_4bit_compute_dtype=torch.float16,
-   bnb_4bit_use_double_quant=True,
-   bnb_4bit_quant_type="nf4"
-)
-
-model = AutoModelForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924", attn_implementation="sdpa", torch_dtype="auto", device_map="auto", quantization_config=quantization_config).to(device)
-tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")
-
-inputs = tokenizer("Bitcoin is", return_tensors="pt")
-inputs = {k: v.to(device) for k, v in inputs.items()}
-output = model.generate(**inputs, max_length=64)
-print(tokenizer.decode(output[0]))
-```

 ## OlmoeConfig

--- a/docs/source/en/model_doc/oneformer.md
+++ b/docs/source/en/model_doc/oneformer.md
@ -38,7 +38,7 @@ This model was contributed by [Jitesh Jain](https://huggingface.co/praeclarumjj3

 ## Usage tips

-  OneFormer requires two inputs during inference: *image* and *task token*.
+- OneFormer requires two inputs during inference: *image* and *task token*.
 - During training, OneFormer only uses panoptic annotations.
 - If you want to train the model in a distributed environment across multiple nodes, then one should update the
  `get_num_masks` function inside in the `OneFormerLoss` class of `modeling_oneformer.py`. When training on multiple nodes, this should be
@ -69,14 +69,7 @@ The resource should ideally demonstrate something new instead of duplicating an

 [[autodoc]] OneFormerImageProcessor
    - preprocess
-    - post_process_semantic_segmentation
-    - post_process_instance_segmentation
-    - post_process_panoptic_segmentation
-
-## OneFormerImageProcessorFast
-
-[[autodoc]] OneFormerImageProcessorFast
-    - preprocess
+    - encode_inputs
    - post_process_semantic_segmentation
    - post_process_instance_segmentation
    - post_process_panoptic_segmentation
@ -94,3 +87,4 @@ The resource should ideally demonstrate something new instead of duplicating an

 [[autodoc]] OneFormerForUniversalSegmentation
    - forward
+    
--- a/docs/source/en/model_doc/opt.md
+++ b/docs/source/en/model_doc/opt.md
@ -1,101 +1,194 @@
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-           <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-           <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N9lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFmsnSos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQsKKaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScSKSBqKCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD82gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtbREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhp
ZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG23nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-           <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-           <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->

 # OPT

-[OPT](https://huggingface.co/papers/2205.01068) is a suite of open-source decoder-only pre-trained transformers whose parameters range from 125M to 175B. OPT models are designed for casual language modeling and aim to enable responsible and reproducible research at scale. OPT-175B is comparable in performance to GPT-3 with only 1/7th the carbon footprint.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-You can find all the original OPT checkpoints under the [OPT](https://huggingface.co/collections/facebook/opt-66ed00e15599f02966818844) collection.
+## Overview

-> [!TIP]
-> This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ), [ybelkada](https://huggingface.co/ybelkada), and [patrickvonplaten](https://huggingface.co/patrickvonplaten).
->
-> Click on the OPT models in the right sidebar for more examples of how to apply OPT to different language tasks.
+The OPT model was proposed in [Open Pre-trained Transformer Language Models](https://huggingface.co/papers/2205.01068) by Meta AI.
+OPT is a series of open-source large causal language models whose performance is comparable to that of GPT-3.

-The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
+The abstract from the paper is the following:

+*Large language models, which are often trained for hundreds of thousands of compute days, have shown remarkable capabilities for zero- and few-shot learning. Given their computational cost, these models are difficult to replicate without significant capital. For the few that are available through APIs, no access is granted to the full model weights, making them difficult to study. We present Open Pre-trained Transformers (OPT), a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters, which we aim to fully and responsibly share with interested researchers. We show that OPT-175B is comparable to GPT-3, while requiring only 1/7th the carbon footprint to develop. We are also releasing our logbook detailing the infrastructure challenges we faced, along with code for experimenting with all of the released models.*

-<hfoptions id="usage">
-<hfoption id="Pipeline">
-  
-```py  
-import torch
-from transformers import pipeline
+This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), and [Patrick Von Platen](https://huggingface.co/patrickvonplaten).
+The original code can be found [here](https://github.com/facebookresearch/metaseq).

-pipeline = pipeline(task="text-generation", model="facebook/opt-125m", torch_dtype=torch.float16, device=0)
-pipeline("Once upon a time, in a land far, far away,", max_length=50, num_return_sequences=1)
-```
+Tips:
+- OPT has the same architecture as [`BartDecoder`].
+- Contrary to GPT2, OPT adds the EOS token `</s>` to the beginning of every prompt.

-</hfoption>
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-device = "cuda"
-
-model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="sdpa")
-tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-
-prompt = ("Once upon a time, in a land far, far away, ")
-
-model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
-model.to(device)
-
-generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
-tokenizer.batch_decode(generated_ids)[0]
-```
-</hfoption>
-<hfoption id="transformers CLI">
-
-```py
-echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model facebook/opt-125m --device 0
-```
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](..quantization/bitsandbytes) to quantize the weights to 8-bits.
-
-```py
-import torch
-from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
-
-device = "cuda"
-
-bnb_config = BitsAndBytesConfig(load_in_8bit=True)
-model = AutoModelForCausalLM.from_pretrained("facebook/opt-13b", torch_dtype=torch.float16, attn_implementation="sdpa", quantization_config=bnb_config)
-tokenizer = AutoTokenizer.from_pretrained("facebook/opt-13b")
-
-prompt = ("Once upon a time, in a land far, far away, ")
-
-model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
-model.to(device)
-
-generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
-tokenizer.batch_decode(generated_ids)[0]
-```
-
-## Notes
-
- OPT adds an `EOS` token `</s>` to the beginning of every prompt.
-
- The `head_mask` argument is ignored if the attention implementation isn't `"eager"`. Set `attn_implementation="eager"` to enable the `head_mask`.
+> [!NOTE]
+> The `head_mask` argument is ignored by every attention implementation other than "eager". If you have a `head_mask` and want it to take effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`.

 ## Resources

- Refer to this [notebook](https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing) for an example of fine-tuning OPT with PEFT, bitsandbytes, and Transformers.
- The [How 🤗 Accelerate runs very large models thanks to PyTorch](https://huggingface.co/blog/accelerate-large-models) blog post demonstrates how to run OPT for inference.
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OPT. If you're
+interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it.
+The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+<PipelineTag pipeline="text-generation" />
+
+- A notebook on [fine-tuning OPT with PEFT, bitsandbytes, and Transformers](https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing). 🌎
+- A blog post on [decoding strategies with OPT](https://huggingface.co/blog/introducing-csearch#62-example-two---opt).
+- [Causal language modeling](https://huggingface.co/course/en/chapter7/6?fw=pt#training-a-causal-language-model-from-scratch) chapter of the 🤗 Hugging Face Course.
+- [`OPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
+- [`TFOPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_clmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
+- [`FlaxOPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#causal-language-modeling).
+
+<PipelineTag pipeline="text-classification" />
+
+- [Text classification task guide](sequence_classification.md)
+- [`OPTForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb).
+
+<PipelineTag pipeline="question-answering" />
+
+- [`OPTForQuestionAnswering`] is supported by this [question answering example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb).
+- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter
+  of the 🤗 Hugging Face Course.
+
+⚡️ Inference
+
+- A blog post on [How 🤗 Accelerate runs very large models thanks to PyTorch](https://huggingface.co/blog/accelerate-large-models) with OPT.
+
+
+## Combining OPT and Flash Attention 2
+
+First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+Also make sure that your hardware is compatible with Flash Attention 2. Read more about it in the official documentation of the flash-attn repository. Also make sure to load your model in half-precision (e.g. `torch.float16`).
+
+To load and run a model using Flash Attention 2, refer to the snippet below:
+
+```python
+>>> import torch
+>>> from transformers import OPTForCausalLM, GPT2Tokenizer
+>>> device = "cuda" # the device to load the model onto
+
+>>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
+>>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
+
+>>> prompt = ("A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the "
+              "Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived "
+              "there?")
+
+>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+>>> model.to(device)
+
+>>> generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
+>>> tokenizer.batch_decode(generated_ids)[0]
+'</s>A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived there?\nStatue: I have lived here for about a year.\nHuman: What is your favorite place to eat?\nStatue: I love'
+```
+
+### Expected speedups
+
+Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `facebook/opt-2.7b` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
+
+<div style="text-align: center">
+<img src="https://user-images.githubusercontent.com/49240599/281101546-d2fca6d2-ee44-48f3-9534-ba8d5bee4531.png">
+</div>
+
+Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `facebook/opt-350m` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
+
+<div style="text-align: center">
+<img src="https://user-images.githubusercontent.com/49240599/281101682-d1144e90-0dbc-46f4-8fc8-c6206cb793c9.png">
+</div>
+
+
+### Using Scaled Dot Product Attention (SDPA)
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```python
+from transformers import OPTForCausalLM
+model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="sdpa")
+...
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+On a local benchmark (L40S-45GB, PyTorch 2.4.0, OS Debian GNU/Linux 11) using `float16` with
+[facebook/opt-350m](https://huggingface.co/facebook/opt-350m), we saw the
+following speedups during training and inference.
+
+### Training
+
+|    batch_size |    seq_len |  Time per batch (eager - s)   |    Time per batch (sdpa - s) |  Speedup (%)   |  Eager peak mem (MB)   |    sdpa peak mem (MB) |  Mem saving (%)   |
+|--------------:|-----------:|:------------------------------|-----------------------------:|:---------------|:-----------------------|----------------------:|:------------------|
+|             1 |        128 | 0.047                         |                        0.037 | 26.360         | 1474.611               |               1474.32 | 0.019             |
+|             1 |        256 | 0.046                         |                        0.037 | 24.335         | 1498.541               |               1499.49 | -0.063            |
+|             1 |        512 | 0.046                         |                        0.037 | 24.959         | 1973.544               |               1551.35 | 27.215            |
+|             1 |       1024 | 0.062                         |                        0.038 | 65.135         | 4867.113               |               1698.35 | 186.578           |
+|             1 |       2048 | 0.230                         |                        0.039 | 483.933        | 15662.224              |               2715.75 | 476.718           |
+|             2 |        128 | 0.045                         |                        0.037 | 20.455         | 1498.164               |               1499.49 | -0.089            |
+|             2 |        256 | 0.046                         |                        0.037 | 24.027         | 1569.367               |               1551.35 | 1.161             |
+|             2 |        512 | 0.045                         |                        0.037 | 20.965         | 3257.074               |               1698.35 | 91.778            |
+|             2 |       1024 | 0.122                         |                        0.038 | 225.958        | 9054.405               |               2715.75 | 233.403           |
+|             2 |       2048 | 0.464                         |                        0.067 | 593.646        | 30572.058              |               4750.55 | 543.548           |
+|             4 |        128 | 0.045                         |                        0.037 | 21.918         | 1549.448               |               1551.35 | -0.123            |
+|             4 |        256 | 0.044                         |                        0.038 | 18.084         | 2451.768               |               1698.35 | 44.361            |
+|             4 |        512 | 0.069                         |                        0.037 | 84.421         | 5833.180               |               2715.75 | 114.791           |
+|             4 |       1024 | 0.262                         |                        0.062 | 319.475        | 17427.842              |               4750.55 | 266.860           |
+|             4 |       2048 | OOM                           |                        0.062 | Eager OOM      | OOM                    |               4750.55 | Eager OOM         |
+|             8 |        128 | 0.044                         |                        0.037 | 18.436         | 2049.115               |               1697.78 | 20.694            |
+|             8 |        256 | 0.048                         |                        0.036 | 32.887         | 4222.567               |               2715.75 | 55.484            |
+|             8 |        512 | 0.153                         |                        0.06  | 154.862        | 10985.391              |               4750.55 | 131.245           |
+|             8 |       1024 | 0.526                         |                        0.122 | 330.697        | 34175.763              |               8821.18 | 287.428           |
+|             8 |       2048 | OOM                           |                        0.122 | Eager OOM      | OOM                    |               8821.18 | Eager OOM         |
+
+### Inference
+
+|    batch_size |    seq_len |    Per token latency eager (ms) |    Per token latency SDPA (ms) |    Speedup (%) |    Mem eager (MB) |  Mem SDPA (MB) |    Mem saved (%) |
+|--------------:|-----------:|--------------------------------:|-------------------------------:|---------------:|------------------:|---------------:|-----------------:|
+|             1 |        128 |                          11.634 |                          8.647 |         34.546 |           717.676 |        717.674 |            0     |
+|             1 |        256 |                          11.593 |                          8.86  |         30.851 |           742.852 |        742.845 |            0.001 |
+|             1 |        512 |                          11.515 |                          8.816 |         30.614 |           798.232 |        799.593 |           -0.17  |
+|             1 |       1024 |                          11.556 |                          8.915 |         29.628 |           917.265 |        895.538 |            2.426 |
+|             2 |        128 |                          12.724 |                         11.002 |         15.659 |           762.434 |        762.431 |            0     |
+|             2 |        256 |                          12.704 |                         11.063 |         14.83  |           816.809 |        816.733 |            0.009 |
+|             2 |        512 |                          12.757 |                         10.947 |         16.535 |           917.383 |        918.339 |           -0.104 |
+|             2 |       1024 |                          13.018 |                         11.018 |         18.147 |          1162.65  |       1114.81  |            4.291 |
+|             4 |        128 |                          12.739 |                         10.959 |         16.243 |           856.335 |        856.483 |           -0.017 |
+|             4 |        256 |                          12.718 |                         10.837 |         17.355 |           957.298 |        957.674 |           -0.039 |
+|             4 |        512 |                          12.813 |                         10.822 |         18.393 |          1158.44  |       1158.45  |           -0.001 |
+|             4 |       1024 |                          13.416 |                         11.06  |         21.301 |          1653.42  |       1557.19  |            6.18  |
+|             8 |        128 |                          12.763 |                         10.891 |         17.193 |          1036.13  |       1036.51  |           -0.036 |
+|             8 |        256 |                          12.89  |                         11.104 |         16.085 |          1236.98  |       1236.87  |            0.01  |
+|             8 |        512 |                          13.327 |                         10.939 |         21.836 |          1642.29  |       1641.78  |            0.031 |
+|             8 |       1024 |                          15.181 |                         11.175 |         35.848 |          2634.98  |       2443.35  |            7.843 |

 ## OPTConfig

--- a/docs/source/en/model_doc/owlv2.md
+++ b/docs/source/en/model_doc/owlv2.md
@ -106,13 +106,6 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce
    - post_process_object_detection
    - post_process_image_guided_detection

-## Owlv2ImageProcessorFast
-
-[[autodoc]] Owlv2ImageProcessorFast
-    - preprocess
-    - post_process_object_detection
-    - post_process_image_guided_detection
-
 ## Owlv2Processor

 [[autodoc]] Owlv2Processor
--- a/docs/source/en/model_doc/patchtsmixer.md
+++ b/docs/source/en/model_doc/patchtsmixer.md
@ -38,7 +38,7 @@ This model was contributed by [ajati](https://huggingface.co/ajati), [vijaye12](

 ## Usage example

-The code snippet below shows how to randomly initialize a PatchTSMixer model. The model is compatible with the [Trainer API](../trainer).
+The code snippet below shows how to randomly initialize a PatchTSMixer model. The model is compatible with the [Trainer API](../trainer.md).

 ```python

--- a/docs/source/en/model_doc/perception_lm.md
+++ b/docs/source/en/model_doc/perception_lm.md
@ -1,68 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# PerceptionLM
-
-## Overview
-
-The PerceptionLM model was proposed in [PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding](https://ai.meta.com/research/publications/perceptionlm-open-access-data-and-models-for-detailed-visual-understanding/) by Jang Hyun Cho et al. It's a fully open, reproducible model for transparent research in image and video understanding. PLM consists of
-a vision encoder with a small scale (<8B parameters) LLM decoder.
-
-The abstract from the paper is the following:
-
-*Vision-language models are integral to computer vision research, yet many high-performing models
-remain closed-source, obscuring their data, design and training recipe. The research community
-has responded by using distillation from black-box models to label training data, achieving strong
-benchmark results, at the cost of measurable scientific progress. However, without knowing the details
-of the teacher model and its data sources, scientific progress remains difficult to measure. In this
-paper, we study building a Perception Language Model (PLM) in a fully open and reproducible
-framework for transparent research in image and video understanding. We analyze standard training
-pipelines without distillation from proprietary models and explore large-scale synthetic data to identify
-critical data gaps, particularly in detailed video understanding. To bridge these gaps, we release 2.8M
-human-labeled instances of fine-grained video question-answer pairs and spatio-temporally grounded
-video captions. Additionally, we introduce PLM–VideoBench, a suite for evaluating challenging video
-understanding tasks focusing on the ability to reason about “what”, “where”, “when”, and “how” of a
-video. We make our work fully reproducible by providing data, training recipes, code & models.*
-
-
-This model was contributed by [shumingh](https://huggingface.co/shumingh).
-The original code can be found [here](https://github.com/facebookresearch/perception_models).
-
-
-## PerceptionLMConfig
-
-[[autodoc]] PerceptionLMConfig
-
-## PerceptionLMProcessor
-
-[[autodoc]] PerceptionLMProcessor
-
-## PerceptionLMImageProcessorFast
-
-[[autodoc]] PerceptionLMImageProcessorFast
-
-## PerceptionLMVideoProcessor
-
-[[autodoc]] PerceptionLMVideoProcessor
-
-## PerceptionLMModel
-
-[[autodoc]] PerceptionLMModel
-
-## PerceptionLMForConditionalGeneration
-
-[[autodoc]] PerceptionLMForConditionalGeneration
-    - forward
--- a/docs/source/en/model_doc/phi4_multimodal.md
+++ b/docs/source/en/model_doc/phi4_multimodal.md
@ -9,53 +9,44 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 -->

-<div style="float: right;">
-  <div class="flex flex-wrap space-x-1">
-    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-EE4C2C?logo=pytorch&logoColor=white&style=flat">
-  </div>
-</div>
+# Phi4 Multimodal

-## Phi4 Multimodal
+## Overview

-[Phi4 Multimodal](https://huggingface.co/papers/2503.01743) is a multimodal model capable of text, image, and speech and audio inputs or any combination of these. It features a mixture of LoRA adapters for handling different inputs, and each input is routed to the appropriate encoder.
+Phi4 Multimodal is a lightweight open multimodal foundation model that leverages the language, vision, and speech research and datasets used for Phi-3.5 and 4.0 models. The model processes text, image, and audio inputs, generating text outputs, and comes with a 128K token context length. The model underwent an enhancement process, incorporating supervised fine-tuning, direct preference optimization, and RLHF (Reinforcement Learning from Human Feedback) to support precise instruction adherence and safety measures. The languages supported by each modality are the following:

-You can find all the original Phi4 Multimodal checkpoints under the [Phi4](https://huggingface.co/collections/microsoft/phi-4-677e9380e514feb5577a40e4) collection.
+- Text: Arabic, Chinese, Czech, Danish, Dutch, English, Finnish, French, German, Hebrew, Hungarian, Italian, Japanese, Korean, Norwegian, Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian
+- Vision: English
+- Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese

-> [!TIP]
-> This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez).
->
-> Click on the Phi-4 Multimodal in the right sidebar for more examples of how to apply Phi-4 Multimodal to different tasks.
+This model was contributed by [Cyril Vallez](https://huggingface.co/cyrilvallez). The most recent code can be
+found [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py).

-The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+## Usage tips

-```python
-from transformers import pipeline
-generator = pipeline("text-generation", model="microsoft/Phi-4-multimodal-instruct", torch_dtype="auto", device=0)
+`Phi4-multimodal-instruct` can be found on the [Hugging Face Hub](https://huggingface.co/microsoft/Phi-4-multimodal-instruct).

-prompt = "Explain the concept of multimodal AI in simple terms."
-
-result = generator(prompt, max_length=50)
-print(result[0]['generated_text'])
-```
-
-</hfoption>
-<hfoption id="AutoModel">
+In the following, we demonstrate how to use it for inference depending on the input modalities (text, image, audio).

 ```python
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

+
+# Define model path
 model_path = "microsoft/Phi-4-multimodal-instruct"
 device = "cuda:0"

+# Load model and processor
 processor = AutoProcessor.from_pretrained(model_path)
-model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device, torch_dtype=torch.float16)
+model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device,  torch_dtype=torch.float16)

+# Optional: load the adapters (note that without them, the base model will very likely not work well)
+model.load_adapter(model_path, adapter_name="speech", device_map=device, adapter_kwargs={"subfolder": 'speech-lora'})
 model.load_adapter(model_path, adapter_name="vision", device_map=device, adapter_kwargs={"subfolder": 'vision-lora'})

+# Part 1: Image Processing
 messages = [
    {
        "role": "user",
@ -66,7 +57,7 @@ messages = [
    },
 ]

-model.set_adapter("vision")
+model.set_adapter("vision") # if loaded, activate the vision adapter
 inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
@ -75,6 +66,7 @@ inputs = processor.apply_chat_template(
    return_tensors="pt",
 ).to(device)

+# Generate response
 generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
@ -85,27 +77,10 @@ response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
 )[0]
 print(f'>>> Response\n{response}')
-```

-</hfoption>
-</hfoptions>

-## Notes
-
-The example below demonstrates inference with an audio and text input.
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
-
-model_path = "microsoft/Phi-4-multimodal-instruct"
-device = "cuda:0"
-
-processor = AutoProcessor.from_pretrained(model_path)
-model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device,  torch_dtype=torch.float16)
-
-model.load_adapter(model_path, adapter_name="speech", device_map=device, adapter_kwargs={"subfolder": 'speech-lora'})
-model.set_adapter("speech")
+# Part 2: Audio Processing
+model.set_adapter("speech") # if loaded, activate the speech adapter
 audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
 messages = [
    {
@ -135,7 +110,6 @@ response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
 )[0]
 print(f'>>> Response\n{response}')
-
 ```

 ## Phi4MultimodalFeatureExtractor
--- a/docs/source/en/model_doc/pixtral.md
+++ b/docs/source/en/model_doc/pixtral.md
@ -86,10 +86,6 @@ output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up

 [[autodoc]] PixtralVisionConfig

-## MistralCommonTokenizer
-
-[[autodoc]] MistralCommonTokenizer
-
 ## PixtralVisionModel

 [[autodoc]] PixtralVisionModel
--- a/docs/source/en/model_doc/qwen2_moe.md
+++ b/docs/source/en/model_doc/qwen2_moe.md
@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.
 # Qwen2MoE


-[Qwen2MoE](https://huggingface.co/papers/2407.10671) is a Mixture-of-Experts (MoE) variant of [Qwen2](./qwen2), available as a base model and an aligned chat model. It uses SwiGLU activation, group query attention and a mixture of sliding window attention and full attention. The tokenizer can also be adapted to multiple languages and codes.
+[Qwen2MoE](https://huggingface.co/papers/2407.10671) is a Mixture-of-Experts (MoE) variant of [Qwen2](./qwen2), available as a base model and an aligned chat model. It uses SwiGLU activation, group query attention and a mixture of sliding window attention and full attention. The tokenizer can also be adapted to multiple languages and codes.

 The MoE architecture uses upcyled models from the dense language models. For example, Qwen1.5-MoE-A2.7B is upcycled from Qwen-1.8B. It has 14.3B parameters but only 2.7B parameters are activated during runtime.

--- a/docs/source/en/model_doc/sam.md
+++ b/docs/source/en/model_doc/sam.md
@ -25,7 +25,7 @@ rendered properly in your Markdown viewer.

 SAM (Segment Anything Model) was proposed in [Segment Anything](https://huggingface.co/papers/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.

-The model can be used to predict segmentation masks of any object of interest given an input image.
+The model can be used to predict segmentation masks of any object of interest given an input image. 

 ![example image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-output.png)

@ -37,9 +37,9 @@ Tips:

 - The model predicts binary masks that states the presence or not of the object of interest given an image.
 - The model predicts much better results if input 2D points and/or input bounding boxes are provided
- You can prompt multiple points for the same image, and predict a single mask.
+- You can prompt multiple points for the same image, and predict a single mask. 
 - Fine-tuning the model is not supported yet
- According to the paper, textual input should be also supported. However, at this time of writing this seems not to be supported according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844).
+- According to the paper, textual input should be also supported. However, at this time of writing this seems not to be supported according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844). 


 This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
@ -149,11 +149,6 @@ alt="drawing" width="900"/>
 [[autodoc]] SamImageProcessor


-## SamImageProcessorFast
-
-[[autodoc]] SamImageProcessorFast
-
-
 ## SamVisionModel

 [[autodoc]] SamVisionModel
--- a/docs/source/en/model_doc/segformer.md
+++ b/docs/source/en/model_doc/segformer.md
@ -128,12 +128,6 @@ If you're interested in submitting a resource to be included here, please feel f
    - preprocess
    - post_process_semantic_segmentation

-## SegformerImageProcessorFast
-
-[[autodoc]] SegformerImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation
-
 <frameworkcontent>
 <pt>

@ -181,4 +175,4 @@ If you're interested in submitting a resource to be included here, please feel f
    - call

 </tf>
-</frameworkcontent>
+</frameworkcontent>
--- a/docs/source/en/model_doc/superglue.md
+++ b/docs/source/en/model_doc/superglue.md
@ -10,31 +10,40 @@ specific language governing permissions and limitations under the License.
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

-->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
-    </div>
-</div>
+-->

 # SuperGlue

-[SuperGlue](https://huggingface.co/papers/1911.11763) is a neural network that matches two sets of local features by jointly finding correspondences and rejecting non-matchable points. Assignments are estimated by solving a differentiable optimal transport problem, whose costs are predicted by a graph neural network. SuperGlue introduces a flexible context aggregation mechanism based on attention, enabling it to reason about the underlying 3D scene and feature assignments jointly. Paired with the [SuperPoint model](https://huggingface.co/magic-leap-community/superpoint), it can be used to match two images and estimate the pose between them. This model is useful for tasks such as image matching, homography estimation, etc.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-You can find all the original SuperGlue checkpoints under the [Magic Leap Community](https://huggingface.co/magic-leap-community) organization.
+## Overview

-> [!TIP]
-> This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
->
-> Click on the SuperGlue models in the right sidebar for more examples of how to apply SuperGlue to different computer vision tasks.
+The SuperGlue model was proposed in [SuperGlue: Learning Feature Matching with Graph Neural Networks](https://huggingface.co/papers/1911.11763) by Paul-Edouard Sarlin, Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.

-The example below demonstrates how to match keypoints between two images with the [`AutoModel`] class.
+This model consists of matching two sets of interest points detected in an image. Paired with the 
+[SuperPoint model](https://huggingface.co/magic-leap-community/superpoint), it can be used to match two images and 
+estimate the pose between them. This model is useful for tasks such as image matching, homography estimation, etc.

-<hfoptions id="usage">
-<hfoption id="AutoModel">
+The abstract from the paper is the following:

-```py
+*This paper introduces SuperGlue, a neural network that matches two sets of local features by jointly finding correspondences 
+and rejecting non-matchable points. Assignments are estimated by solving a differentiable optimal transport problem, whose costs 
+are predicted by a graph neural network. We introduce a flexible context aggregation mechanism based on attention, enabling 
+SuperGlue to reason about the underlying 3D scene and feature assignments jointly. Compared to traditional, hand-designed heuristics, 
+our technique learns priors over geometric transformations and regularities of the 3D world through end-to-end training from image 
+pairs. SuperGlue outperforms other learned approaches and achieves state-of-the-art results on the task of pose estimation in 
+challenging real-world indoor and outdoor environments. The proposed method performs matching in real-time on a modern GPU and 
+can be readily integrated into modern SfM or SLAM systems. The code and trained weights are publicly available at this [URL](https://github.com/magicleap/SuperGluePretrainedNetwork).*
+
+## How to use
+
+Here is a quick example of using the model. Since this model is an image matching model, it requires pairs of images to be matched. 
+The raw outputs contain the list of keypoints detected by the keypoint detector as well as the list of matches with their corresponding 
+matching scores.
+```python
 from transformers import AutoImageProcessor, AutoModel
 import torch
 from PIL import Image
@ -43,7 +52,7 @@ import requests
 url_image1 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_98169888_3347710852.jpg"
 image1 = Image.open(requests.get(url_image1, stream=True).raw)
 url_image2 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_26757027_6717084061.jpg"
-image2 = Image.open(requests.get(url_image2, stream=True).raw)
+image2 = Image.open(requests.get(url_image2, stream=True).raw)

 images = [image1, image2]

@ -53,70 +62,67 @@ model = AutoModel.from_pretrained("magic-leap-community/superglue_outdoor")
 inputs = processor(images, return_tensors="pt")
 with torch.no_grad():
    outputs = model(**inputs)
-
-# Post-process to get keypoints and matches
-image_sizes = [[(image.height, image.width) for image in images]]
-processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
 ```

-</hfoption>
-</hfoptions>
+You can use the `post_process_keypoint_matching` method from the `SuperGlueImageProcessor` to get the keypoints and matches in a more readable format:

-## Notes
+```python
+image_sizes = [[(image.height, image.width) for image in images]]
+outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
+for i, output in enumerate(outputs):
+    print("For the image pair", i)
+    for keypoint0, keypoint1, matching_score in zip(
+            output["keypoints0"], output["keypoints1"], output["matching_scores"]
+    ):
+        print(
+            f"Keypoint at coordinate {keypoint0.numpy()} in the first image matches with keypoint at coordinate {keypoint1.numpy()} in the second image with a score of {matching_score}."
+        )

- SuperGlue performs feature matching between two images simultaneously, requiring pairs of images as input.
+```

-    ```python
-    from transformers import AutoImageProcessor, AutoModel
-    import torch
-    from PIL import Image
-    import requests
-    
-    processor = AutoImageProcessor.from_pretrained("magic-leap-community/superglue_outdoor")
-    model = AutoModel.from_pretrained("magic-leap-community/superglue_outdoor")
-    
-    # SuperGlue requires pairs of images
-    images = [image1, image2]
-    inputs = processor(images, return_tensors="pt")
-    outputs = model(**inputs)
-    
-    # Extract matching information
-    keypoints0 = outputs.keypoints0  # Keypoints in first image
-    keypoints1 = outputs.keypoints1  # Keypoints in second image
-    matches = outputs.matches        # Matching indices
-    matching_scores = outputs.matching_scores  # Confidence scores
-    ```
+From the outputs, you can visualize the matches between the two images using the following code:
+```python
+import matplotlib.pyplot as plt
+import numpy as np

- The model outputs matching indices, keypoints, and confidence scores for each match.
- For better visualization and analysis, use the [`SuperGlueImageProcessor.post_process_keypoint_matching`] method to get matches in a more readable format.
+# Create side by side image
+merged_image = np.zeros((max(image1.height, image2.height), image1.width + image2.width, 3))
+merged_image[: image1.height, : image1.width] = np.array(image1) / 255.0
+merged_image[: image2.height, image1.width :] = np.array(image2) / 255.0
+plt.imshow(merged_image)
+plt.axis("off")

-    ```py
-    # Process outputs for visualization
-    image_sizes = [[(image.height, image.width) for image in images]]
-    processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
-    
-    for i, output in enumerate(processed_outputs):
-        print(f"For the image pair {i}")
-        for keypoint0, keypoint1, matching_score in zip(
-                output["keypoints0"], output["keypoints1"], output["matching_scores"]
-        ):
-            print(f"Keypoint at {keypoint0.numpy()} matches with keypoint at {keypoint1.numpy()} with score {matching_score}")
-    ```
+# Retrieve the keypoints and matches
+output = outputs[0]
+keypoints0 = output["keypoints0"]
+keypoints1 = output["keypoints1"]
+matching_scores = output["matching_scores"]
+keypoints0_x, keypoints0_y = keypoints0[:, 0].numpy(), keypoints0[:, 1].numpy()
+keypoints1_x, keypoints1_y = keypoints1[:, 0].numpy(), keypoints1[:, 1].numpy()

- Visualize the matches between the images using the built-in plotting functionality.
+# Plot the matches
+for keypoint0_x, keypoint0_y, keypoint1_x, keypoint1_y, matching_score in zip(
+        keypoints0_x, keypoints0_y, keypoints1_x, keypoints1_y, matching_scores
+):
+    plt.plot(
+        [keypoint0_x, keypoint1_x + image1.width],
+        [keypoint0_y, keypoint1_y],
+        color=plt.get_cmap("RdYlGn")(matching_score.item()),
+        alpha=0.9,
+        linewidth=0.5,
+    )
+    plt.scatter(keypoint0_x, keypoint0_y, c="black", s=2)
+    plt.scatter(keypoint1_x + image1.width, keypoint1_y, c="black", s=2)

-    ```py
-    # Easy visualization using the built-in plotting method
-    processor.visualize_keypoint_matching(images, processed_outputs)
-    ```
+# Save the plot
+plt.savefig("matched_image.png", dpi=300, bbox_inches='tight')
+plt.close()
+```

-<div class="flex justify-center">
-    <img src="https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/01ZYaLB1NL5XdA8u7yCo4.png">
-</div>
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/01ZYaLB1NL5XdA8u7yCo4.png)

-## Resources
-
- Refer to the [original SuperGlue repository](https://github.com/magicleap/SuperGluePretrainedNetwork) for more examples and implementation details.
+This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
+The original code can be found [here](https://github.com/magicleap/SuperGluePretrainedNetwork).

 ## SuperGlueConfig

@ -127,16 +133,10 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
 [[autodoc]] SuperGlueImageProcessor

 - preprocess
- post_process_keypoint_matching
- visualize_keypoint_matching

-<frameworkcontent>
-<pt>
 ## SuperGlueForKeypointMatching

 [[autodoc]] SuperGlueForKeypointMatching

 - forward
-
-</pt>
-</frameworkcontent>
+- post_process_keypoint_matching
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@ -130,11 +130,6 @@ processed_outputs = processor.post_process_keypoint_detection(outputs, [image_si

 [[autodoc]] SuperPointImageProcessor

- preprocess
-
-## SuperPointImageProcessorFast
-
-[[autodoc]] SuperPointImageProcessorFast
 - preprocess
 - post_process_keypoint_detection

--- a/docs/source/en/model_doc/switch_transformers.md
+++ b/docs/source/en/model_doc/switch_transformers.md
@ -14,90 +14,35 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
+# SwitchTransformers
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>

-# Switch Transformers
+## Overview

-[Switch Transformers](https://huggingface.co/papers/2101.03961) is a sparse T5 model where the MLP layer is replaced by a Mixture-of-Experts (MoE). A routing mechanism associates each token with an expert and each expert is a dense MLP. Sparsity enables better scaling and the routing mechanism allows the model to select relevant weights on the fly which increases model capacity.
+The SwitchTransformers model was proposed in [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://huggingface.co/papers/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.

-You can find all the original Switch Transformers checkpoints under the [Switch Transformer](https://huggingface.co/collections/google/switch-transformers-release-6548c35c6507968374b56d1f) collection.
+The Switch Transformer model uses a sparse T5 encoder-decoder architecture, where the MLP layers are replaced by a Mixture of Experts (MoE). A routing mechanism (top 1 in this case) associates each token with one of the experts, where each expert is a dense MLP. While Switch Transformers have a lot more weights than their equivalent dense models, the sparsity allows better scaling and better finetuning performance at scale.
+During a forward pass, only a fraction of the weights are used. The routing mechanism allows the model to select relevant weights on the fly which increases the model capacity without increasing the number of operations.

+The abstract from the paper is the following:

-> [!TIP]
-> This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
->
-> Click on the Switch Transformers models in the right sidebar for more examples of how to apply Switch Transformers to different natural language tasks.
+*In deep learning, models typically reuse the same parameters for all inputs. Mixture of Experts (MoE) defies this and instead selects different parameters for each incoming example. The result is a sparsely-activated model -- with outrageous numbers of parameters -- but a constant computational cost. However, despite several notable successes of MoE, widespread adoption has been hindered by complexity, communication costs and training instability -- we address these with the Switch Transformer. We simplify the MoE routing algorithm and design intuitive improved models with reduced communication and computational costs. Our proposed training techniques help wrangle the instabilities and we show large sparse models may be trained, for the first time, with lower precision (bfloat16) formats. We design models based off T5-Base and T5-Large to obtain up to 7x increases in pre-training speed with the same computational resources. These improvements extend into multilingual settings where we measure gains over the mT5-Base version across all 101 languages. Finally, we advance the current scale of language models by pre-training up to trillion parameter models on the "Colossal Clean Crawled Corpus" and achieve a 4x speedup over the T5-XXL model.*

-The example below demonstrates how to predict the masked token with [`Pipeline`], [`AutoModel`], and from the command line.
+This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArthurZ).
+The original code can be found [here](https://github.com/google/flaxformer/tree/main/flaxformer/architectures/moe).

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+## Usage tips

-```python
-import torch
-from transformers import pipeline
+- SwitchTransformers uses the [`T5Tokenizer`], which can be loaded directly from each model's repository.
+- The released weights are pretrained on an English [Masked Language Modeling](https://moon-ci-docs.huggingface.co/docs/transformers/pr_19323/en/glossary#general-terms) task, and should be finetuned.

-pipeline = pipeline(
-    task="text2text-generation", 
-    model="google/switch-base-8",
-    torch_dtype=torch.float16,
-    device=0
-)
-print(pipeline("The capital of France is <extra_id_0>."))
-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```python
-import torch
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
-model = AutoModelForSeq2SeqLM.from_pretrained("google/switch-base-8", device_map="auto", torch_dtype=torch.float16)
-
-input_text = "The capital of France is <extra_id_0>."
-input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(0)
-
-outputs = model.generate(input_ids)
-print(tokenizer.decode(outputs[0]))
-```
-
-</hfoption>
-<hfoption id="transformers CLI">
-
-```bash
-echo -e "The capital of France is <extra_id_0>." | transformers run --task text2text-generation --model google/switch-base-8 --device 0
-# [{'generated_text': 'Paris.'}]
-```
-
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes/) to only quantize the weights to 8-bits.
-
-```py
-# pip install bitsandbytes
-import torch
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
-
-tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-model = AutoModelForSeq2SeqLM.from_pretrained("google/switch-base-8", device_map="auto", quantization_config=quantization_config)
-
-input_text = "The capital of France is <extra_id_0>."
-input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(0)
-
-outputs = model.generate(input_ids)
-print(tokenizer.decode(outputs[0]))
-```
+## Resources

+- [Translation task guide](../tasks/translation)
+- [Summarization task guide](../tasks/summarization)

 ## SwitchTransformersConfig

--- a/docs/source/en/model_doc/t5gemma.md
+++ b/docs/source/en/model_doc/t5gemma.md
@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.

 # T5Gemma

-T5Gemma (aka encoder-decoder Gemma) was proposed in a [research paper](https://arxiv.org/abs/2504.06225) by Google. It is a family of encoder-decoder large language models, developed by adapting pretrained decoder-only models into encoder-decoder. T5Gemma includes pretrained and instruction-tuned variants. The architecture is based on transformer encoder-decoder design following T5, with improvements from Gemma 2: GQA, RoPE, GeGLU activation, RMSNorm, and interleaved local/global attention.
+T5Gemma (aka encoder-decoder Gemma) was proposed in a [research paper](https://arxiv.org/abs/2504.06225) by Google. It is a family of encoder-decoder large language models, developed by adapting pretrained decoder-only models into encoder-decoder. T5Gemma includes pretrained and instruction-tuned variants. The architecture is based on transformer encoder-decoder design following T5, with improvements from Gemma 2: GQA, RoPE, GeGLU activation, RMSNorm, and interleaved local/global attention.

 T5Gemma has two groups of model sizes: 1) [Gemma 2](https://ai.google.dev/gemma/docs/core/model_card_2) sizes (2B-2B, 9B-2B, and 9B-9B), which are based on the offical Gemma 2 models (2B and 9B); and 2) [T5](https://arxiv.org/abs/1910.10683) sizes (Small, Base, Large, and XL), where are pretrained under the Gemma 2 framework following T5 configuration. In addition, we also provide a model at ML size (medium large, ~2B in total), which is in-between T5 Large and T5 XL.

--- a/docs/source/en/model_doc/timesfm.md
+++ b/docs/source/en/model_doc/timesfm.md
@ -37,7 +37,6 @@ The original code can be found [here](https://github.com/google-research/timesfm
 To use the model:

 ```python
-import numpy as np
 import torch
 from transformers import TimesFmModelForPrediction

--- a/docs/source/en/model_doc/voxtral.md
+++ b/docs/source/en/model_doc/voxtral.md
@ -1,351 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Voxtral
-
-Voxtral is an upgrade of [Ministral 3B and Mistral Small 3B](https://mistral.ai/news/ministraux), extending its language capabilities with audio input support. It is designed to handle tasks such as speech transcription, translation, and audio understanding.
-
-You can read more in Mistral's [release blog post](https://mistral.ai/news/voxtral).
-
-The model is available in two checkpoints:
- 3B: [mistralai/Voxtral-Mini-3B-2507](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)
- 24B: [mistralai/Voxtral-Small-24B-2507](https://huggingface.co/mistralai/Voxtral-Small-24B-2507)
-
-## Key Features
-
-Voxtral builds on Ministral-3B by adding audio processing capabilities:
-
- **Transcription mode**: Includes a dedicated mode for speech transcription. By default, Voxtral detects the spoken language and transcribes it accordingly.  
- **Long-form context**: With a 32k token context window, Voxtral can process up to 30 minutes of audio for transcription or 40 minutes for broader audio understanding.  
- **Integrated Q&A and summarization**: Supports querying audio directly and producing structured summaries without relying on separate ASR and language models.  
- **Multilingual support**: Automatically detects language and performs well across several widely spoken languages, including English, Spanish, French, Portuguese, Hindi, German, Dutch, and Italian.  
- **Function calling via voice**: Can trigger functions or workflows directly from spoken input based on detected user intent.  
- **Text capabilities**: Maintains the strong text processing performance of its Ministral-3B foundation.
-
-## Usage
-
-### Audio Instruct Mode
-
-The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
-
-➡️ audio + text instruction
-```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "audio",
-                "url": "https://huggingface.co/datasets/eustlb/audio-samples/resolve/main/dude_where_is_my_car.wav",
-            },
-            {"type": "text", "text": "What can you tell me about this audio?"},
-        ],
-    }
-]
-
-inputs = processor.apply_chat_template(conversation)
-inputs = inputs.to(device, dtype=torch.bfloat16)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-print("\nGenerated response:")
-print("=" * 80)
-print(decoded_outputs[0])
-print("=" * 80)
-```
-
-➡️ multi-audio + text instruction 
-```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "audio",
-                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/mary_had_lamb.mp3",
-            },
-            {
-                "type": "audio",
-                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3",
-            },
-            {"type": "text", "text": "What sport and what nursery rhyme are referenced?"},
-        ],
-    }
-]
-
-inputs = processor.apply_chat_template(conversation)
-inputs = inputs.to(device, dtype=torch.bfloat16)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-print("\nGenerated response:")
-print("=" * 80)
-print(decoded_outputs[0])
-print("=" * 80)
-```
-
-➡️ multi-turn:
-```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "audio",
-                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
-            },
-            {
-                "type": "audio",
-                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
-            },
-            {"type": "text", "text": "Describe briefly what you can hear."},
-        ],
-    },
-    {
-        "role": "assistant",
-        "content": "The audio begins with the speaker delivering a farewell address in Chicago, reflecting on his eight years as president and expressing gratitude to the American people. The audio then transitions to a weather report, stating that it was 35 degrees in Barcelona the previous day, but the temperature would drop to minus 20 degrees the following day.",
-    },
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "audio",
-                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
-            },
-            {"type": "text", "text": "Ok, now compare this new audio with the previous one."},
-        ],
-    },
-]
-
-inputs = processor.apply_chat_template(conversation)
-inputs = inputs.to(device, dtype=torch.bfloat16)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-print("\nGenerated response:")
-print("=" * 80)
-print(decoded_outputs[0])
-print("=" * 80)
-```
-
-➡️ text only:
-```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What if a cyber brain could possibly generate its own ghost, and create a soul all by itself?",
-            },
-        ],
-    }
-]
-
-inputs = processor.apply_chat_template(conversation)
-inputs = inputs.to(device, dtype=torch.bfloat16)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-print("\nGenerated response:")
-print("=" * 80)
-print(decoded_outputs[0])
-print("=" * 80)
-```
-
-➡️ audio only:
-```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "audio",
-                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
-            },
-        ],
-    }
-]
-
-inputs = processor.apply_chat_template(conversation)
-inputs = inputs.to(device, dtype=torch.bfloat16)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-print("\nGenerated response:")
-print("=" * 80)
-print(decoded_outputs[0])
-print("=" * 80)
-```
-
-➡️ batched inference!
-```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
-conversations = [
-    [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "audio",
-                    "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
-                },
-                {
-                    "type": "audio",
-                    "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
-                },
-                {
-                    "type": "text",
-                    "text": "Who's speaking in the speach and what city's weather is being discussed?",
-                },
-            ],
-        }
-    ],
-    [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "audio",
-                    "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3",
-                },
-                {"type": "text", "text": "What can you tell me about this audio?"},
-            ],
-        }
-    ],
-]
-
-inputs = processor.apply_chat_template(conversations)
-inputs = inputs.to(device, dtype=torch.bfloat16)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-print("\nGenerated responses:")
-print("=" * 80)
-for decoded_output in decoded_outputs:
-    print(decoded_output)
-    print("=" * 80)
-```
-
-### Transcription Mode
-
-Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!
-
-```python
-from transformers import VoxtralForConditionalGeneration, AutoProcessor
-import torch
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-repo_id = "mistralai/Voxtral-Mini-3B-2507"
-
-processor = AutoProcessor.from_pretrained(repo_id)
-model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
-
-inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
-inputs = inputs.to(device, dtype=torch.bfloat16)
-
-outputs = model.generate(**inputs, max_new_tokens=500)
-decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-print("\nGenerated responses:")
-print("=" * 80)
-for decoded_output in decoded_outputs:
-    print(decoded_output)
-    print("=" * 80)
-```
-
-This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
-
-## VoxtralConfig
-
-[[autodoc]] VoxtralConfig
-
-## VoxtralEncoderConfig
-
-[[autodoc]] VoxtralEncoderConfig
-
-## VoxtralProcessor
-
-[[autodoc]] VoxtralProcessor
-
-## VoxtralEncoder
-
-[[autodoc]] VoxtralEncoder
-    - forward
-
-## VoxtralForConditionalGeneration
-
-[[autodoc]] VoxtralForConditionalGeneration
-    - forward
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Cyril Vallez	1cf5a78ab8	fix	2025-07-09 19:23:29 +02:00
Cyril Vallez	e68eb2574a	gemma 3n	2025-07-09 19:10:52 +02:00
Cyril Vallez	680d66553f	fix init of the MambaCaches	2025-07-09 18:49:31 +02:00
Cyril Vallez	eaa373a5df	style	2025-07-09 18:49:31 +02:00
Cyril Vallez	6521a65cc1	remove docstrings and decorators in smolvlm	2025-07-09 18:49:31 +02:00
Cyril Vallez	268bed37ae	fix	2025-07-09 18:49:31 +02:00
Cyril Vallez	3fab9167b3	style	2025-07-09 18:49:31 +02:00
Cyril Vallez	21cb180caa	simplify	2025-07-09 18:49:31 +02:00
Cyril Vallez	dc9f8dad0a	fix	2025-07-09 18:49:31 +02:00
Cyril Vallez	59020bfc9f	fix	2025-07-09 18:49:31 +02:00
Cyril Vallez	51461c690a	style	2025-07-09 18:49:31 +02:00
Cyril Vallez	9fb4596487	Apply all fixes	2025-07-09 18:49:31 +02:00
Cyril Vallez	41db913ac3	apply changes	2025-07-09 18:49:31 +02:00
Cyril Vallez	494e437b56	fix warning	2025-07-09 18:49:31 +02:00
Cyril Vallez	1017a83fb6	check modular files as well	2025-07-09 18:49:31 +02:00
Cyril Vallez	f6c3136d18	style	2025-07-09 18:49:29 +02:00
Cyril Vallez	0ded085a57	fix some modular	2025-07-09 18:45:54 +02:00