feat: rework widgets

feat: add a ContinuousBatchingVisualizer
Better CI (#38552)
2025-10-20 17:13:56 +08:00 · 2025-06-10 17:03:21 +02:00 · 2025-06-06 19:04:29 +02:00 · 2025-06-06 17:59:14 +02:00 · 2025-06-06 17:14:00 +02:00 · 2025-06-06 14:54:00 +00:00
1386 changed files with 89416 additions and 83892 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -7,6 +7,18 @@ parameters:
    nightly:
        type: boolean
        default: false
+    GHA_Actor:
+        type: string
+        default: ""
+    GHA_Action:
+        type: string
+        default: ""
+    GHA_Event:
+        type: string
+        default: ""
+    GHA_Meta:
+        type: string
+        default: ""

 jobs:
    # Ensure running with CircleCI/huggingface
@ -31,14 +43,6 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: if [[ "$CIRCLE_PULL_REQUEST" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
-            - run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/${CIRCLE_PULL_REQUEST##*/} >> github.txt'
-            - run: cat github.txt
-            - run: (python3 -c 'import json; from datetime import datetime; fp = open("github.txt"); data = json.load(fp); fp.close(); f = "%Y-%m-%dT%H:%M:%SZ"; created = datetime.strptime(data["created_at"], f); updated = datetime.strptime(data["updated_at"], f); s = (updated - created).total_seconds(); print(int(s))' || true) > elapsed.txt
-            - run: if [ "$(cat elapsed.txt)" == "" ]; then echo 60 > elapsed.txt; fi
-            - run: cat elapsed.txt
-            - run: if [ "$(cat elapsed.txt)" -lt "30" ]; then echo "PR is just opened, wait some actions from GitHub"; sleep 30; fi
-            - run: 'if grep -q "\"draft\": true," github.txt; then echo "draft mode, skip test!"; circleci-agent step halt; fi'
            - run: uv pip install -U -e .
            - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
            - run: mkdir -p test_preparation
@ -108,8 +112,6 @@ jobs:

            - run:
                name: "Retrieve Artifact Paths"
-                env:
-                    CIRCLE_TOKEN: ${{ secrets.CI_ARTIFACT_TOKEN }}
                command: |
                    project_slug="gh/${CIRCLE_PROJECT_USERNAME}/${CIRCLE_PROJECT_REPONAME}"
                    job_number=${CIRCLE_BUILD_NUM}
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -28,6 +28,8 @@ COMMON_ENV_VARIABLES = {
    "TRANSFORMERS_IS_CI": True,
    "PYTEST_TIMEOUT": 120,
    "RUN_PIPELINE_TESTS": False,
+    # will be adjusted in `CircleCIJob.to_dict`.
+    "RUN_FLAKY": True,
 }
 # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical
 COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}
@ -126,6 +128,8 @@ class CircleCIJob:

    def to_dict(self):
        env = COMMON_ENV_VARIABLES.copy()
+        # Do not run tests decorated by @is_flaky on pull requests
+        env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
        env.update(self.additional_env)

        job = {
@ -209,7 +213,7 @@ generate_job = CircleCIJob(
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
    # networkx==3.3 (after #36957) cause some issues
    # TODO: remove this once it works directly
-    install_steps=["uv venv && uv pip install . && uv pip install networkx==3.2.1"],
+    install_steps=["uv venv && uv pip install ."],
    marker="generate",
    parallelism=6,
 )
@ -305,7 +309,7 @@ onnx_job = CircleCIJob(
    docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
    install_steps=[
        "uv venv",
-        "uv pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]",
+        "uv pip install .[testing,sentencepiece,onnxruntime,vision,rjieba]",
    ],
    pytest_options={"k onnx": None},
    pytest_num_workers=1,
@ -334,7 +338,7 @@ non_model_job = CircleCIJob(
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
    # networkx==3.3 (after #36957) cause some issues
    # TODO: remove this once it works directly
-    install_steps=["uv venv && uv pip install . && uv pip install networkx==3.2.1"],
+    install_steps=["uv venv && uv pip install ."],
    marker="not generate",
    parallelism=6,
 )
@ -393,7 +397,12 @@ def create_circleci_config(folder=None):
        "parameters": {
            # Only used to accept the parameters from the trigger
            "nightly": {"type": "boolean", "default": False},
-            "tests_to_run": {"type": "string", "default": ''},
+            # Only used to accept the parameters from GitHub Actions trigger
+            "GHA_Actor": {"type": "string", "default": ""},
+            "GHA_Action": {"type": "string", "default": ""},
+            "GHA_Event": {"type": "string", "default": ""},
+            "GHA_Meta": {"type": "string", "default": ""},
+            "tests_to_run": {"type": "string", "default": ""},
            **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs},
            **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs},
        },
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -16,7 +16,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
      placeholder: transformers version, platform, python version, ...
    validations:
      required: true
--- a/.github/ISSUE_TEMPLATE/migration.yml
+++ b/.github/ISSUE_TEMPLATE/migration.yml
@ -6,7 +6,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
      render: shell
      placeholder: transformers version, platform, python version, ...
    validations:
--- a/.github/workflows/add-model-like.yml
+++ b/.github/workflows/add-model-like.yml
@ -54,7 +54,7 @@ jobs:
      - name: Create model files
        run: |
          . ~/venv/bin/activate
-          transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
+          transformers add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
          make style
          make fix-copies

--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -64,7 +64,7 @@ jobs:
            commit_id=$GITHUB_SHA
          fi
          commit_msg=$(git show -s --format=%s | cut -c1-70)
-          python3 benchmark/benchmarks_entrypoint.py "$BRANCH_NAME" "$commit_id" "$commit_msg"
+          python3 benchmark/benchmarks_entrypoint.py "huggingface/transformers" "$BRANCH_NAME" "$commit_id" "$commit_msg"
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
          # Enable this to see debug logs
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -19,7 +19,7 @@ concurrency:

 jobs:
  latest-docker:
-    name: "Latest PyTorch + TensorFlow [dev]"
+    name: "Latest PyTorch [dev]"
    runs-on:
      group: aws-general-8-plus
    steps:
@ -267,44 +267,6 @@ jobs:
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

-  latest-tensorflow:
-    name: "Latest TensorFlow [dev]"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on:
-      group: aws-general-8-plus
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v4
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./docker/transformers-tensorflow-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-tensorflow-gpu
-
-      - name: Post to Slack
-        if: always()
-        uses: huggingface/hf-workflows/.github/actions/post-slack@main
-        with:
-          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
-          status: ${{ job.status }}
-          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
-
  latest-pytorch-deepspeed-amd:
    name: "PyTorch + DeepSpeed (AMD) [dev]"
    runs-on:
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -2,6 +2,15 @@ name: Build PR Documentation

 on:
  pull_request:
+  workflow_call:
+    inputs:
+      pr_number:
+        type: string
+        required: true
+      commit_sha:
+        type: string
+        required: true
+

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@ -9,9 +18,9 @@ concurrency:

 jobs:
  build:
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@6e2eb04a2604817c97be03786efa494fe3acae90
    with:
-      commit_sha: ${{ github.event.pull_request.head.sha }}
-      pr_number: ${{ github.event.number }}
+      commit_sha: ${{ inputs.commit_sha || github.event.pull_request.head.sha }}
+      pr_number: ${{ inputs.pr_number || github.event.number }}
      package: transformers
      languages: en
--- a/.github/workflows/change_pr_to_draft.yml
+++ b/.github/workflows/change_pr_to_draft.yml
@ -1,25 +0,0 @@
-name: Change PR to draft
-
-on:
-  pull_request_target:
-    types: [opened, reopened]
-
-jobs:
-  convert_pr_to_draft:
-    runs-on: ubuntu-22.04
-    name: Convert PR to draft
-    permissions:
-      pull-requests: write
-      contents: write
-    if: github.event.pull_request.draft == false
-    steps:
-      - name: Convert PR to draft
-        shell: bash
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          REPO: ${{ github.repository }}
-        run: |
-          echo $PR_NUMBER
-          gh pr ready $PR_NUMBER --repo $REPO --undo
-          gh pr comment $PR_NUMBER --repo $REPO --body "Hi 👋, thank you for opening this pull request! The pull request is converted to draft by default. The CI will be paused while the PR is in draft mode. When it is ready for review, please click the \`Ready for review\` button (at the bottom of the PR page). This will assign reviewers and trigger CI."
--- a/.github/workflows/check_failed_model_tests.yml
+++ b/.github/workflows/check_failed_model_tests.yml
@ -9,6 +9,18 @@ on:
      start_sha:
        required: true
        type: string
+      job:
+        required: true
+        type: string
+      slack_report_channel:
+        required: true
+        type: string
+      ci_event:
+        required: true
+        type: string
+      report_repo_id:
+        required: true
+        type: string


 env:
@ -26,77 +38,128 @@ env:


 jobs:
-  run_models_gpu:
+  check_new_failures:
    name: " "
    runs-on:
-      group: aws-g4dn-2xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - uses: actions/download-artifact@v4
        with:
-          name: ci_results_run_models_gpu
-          path: /transformers/ci_results_run_models_gpu
+          name: ci_results_${{ inputs.job }}
+          path: /transformers/ci_results_${{ inputs.job }}
+
+      - name: Check file  # gate step: records `process=true|false` in GITHUB_ENV; the later steps are conditioned on `env.process == 'true'`
+        working-directory: /transformers
+        run: |
+          if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then  # backticks below are escaped so bash prints them instead of executing the path
+            echo "\`ci_results_${{ inputs.job }}/new_failures.json\` exists, continue ..."
+            echo "process=true" >> $GITHUB_ENV
+          else
+            echo "\`ci_results_${{ inputs.job }}/new_failures.json\` doesn't exist, abort."
+            echo "process=false" >> $GITHUB_ENV
+          fi
+
+      - uses: actions/download-artifact@v4
+        if: ${{ env.process == 'true' }}
+        with:
+          pattern: setup_values*
+          path: setup_values
+          merge-multiple: true
+
+      - name: Prepare some setup values
+        if: ${{ env.process == 'true' }}
+        run: |
+          if [ -f setup_values/prev_workflow_run_id.txt ]; then
+            echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> $GITHUB_ENV
+          else
+            echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
+          fi
+
+          if [ -f setup_values/other_workflow_run_id.txt ]; then
+            echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV
+          else
+            echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
+          fi

      - name: Update clone
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        run: git fetch && git checkout ${{ github.sha }}

      - name: Get target commit
        working-directory: /transformers/utils
+        if: ${{ env.process == 'true' }}
        run: |
-          echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"]); print(commit)')" >> $GITHUB_ENV
+          echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV

      - name: Checkout to `start_sha`
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        run: git fetch && git checkout ${{ inputs.start_sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: NVIDIA-SMI
+        if: ${{ env.process == 'true' }}
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        run: pip freeze

      - name: Check failed tests
        working-directory: /transformers
-        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_run_models_gpu/new_model_failures.json --output_file new_model_failures_with_bad_commit.json
+        if: ${{ env.process == 'true' }}
+        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json

      - name: Show results
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        run: |
-          ls -l new_model_failures_with_bad_commit.json
-          cat new_model_failures_with_bad_commit.json
+          ls -l new_failures_with_bad_commit.json
+          cat new_failures_with_bad_commit.json

      - name: Checkout back
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        run: |
          git checkout ${{ inputs.start_sha }}

      - name: Process report
        shell: bash
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        env:
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
+          JOB_NAME: ${{ inputs.job }}
+          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
        run: |
          python3 utils/process_bad_commit_report.py

      - name: Process report
        shell: bash
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        env:
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
+          JOB_NAME: ${{ inputs.job }}
+          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
        run: |
          {
            echo 'REPORT_TEXT<<EOF'
@ -104,17 +167,31 @@ jobs:
            echo EOF
          } >> "$GITHUB_ENV"

+      - name: Prepare Slack report title
+        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
+        run: |
+          pip install slack_sdk
+          echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
+
      - name: Send processed report
-        if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
+        if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }}
        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
        with:
          # Slack channel id, channel name, or user id to post message.
          # See also: https://api.slack.com/methods/chat.postMessage#channels
-          channel-id: '#transformers-ci-feedback-tests'
+          channel-id: '#${{ inputs.slack_report_channel }}'
          # For posting a rich message using Block Kit
          payload: |
            {
              "blocks": [
+                {
+                  "type": "header",
+                  "text": {
+                    "type": "plain_text",
+                    "text": "${{ env.title }}"
+                  }
+                },
                {
                  "type": "section",
                  "text": {
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -28,7 +28,7 @@ jobs:
      matrix:
        split_keys: ${{ fromJson(inputs.split_keys) }}
    runs-on: 
-      group: aws-g4dn-2xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -15,7 +15,7 @@ jobs:
  setup:
    name: Setup
    runs-on: 
-      group: aws-g4dn-2xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -107,7 +107,7 @@ jobs:
        run: |
          echo "${{ inputs.machine_type }}"

-          if [ "${{ inputs.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
--- a/.github/workflows/new_model_pr_merged_notification.yml
+++ b/.github/workflows/new_model_pr_merged_notification.yml
@ -59,7 +59,7 @@ jobs:
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
-                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh"
+                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh\ncommit SHA: ${{ env.COMMIT_SHA }}"
                  }
                }
              ]
--- a/.github/workflows/pr-style-bot.yml
+++ b/.github/workflows/pr-style-bot.yml
@ -0,0 +1,34 @@
+# To run this bot, comment "@bot /style" on a PR
+name: Style Bot
+
+on:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  style:
+    uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@639ee721e149a281fe726a50a2cc1354b48bc463
+    with:
+      python_quality_dependencies: "[quality]"
+      style_command_type: "default"
+    secrets:
+      bot_token: ${{ secrets.GITHUB_TOKEN }}
+
+  check-outputs:
+    runs-on: ubuntu-latest
+    needs: style
+    steps:
+      - run: echo ${{ needs.style.outputs.pr_number }}
+      - run: echo ${{ needs.style.outputs.new_commit_sha }}
+
+  trigger:
+    needs: style
+    if: needs.style.outputs.new_commit_sha != ''
+    uses: "./.github/workflows/build_pr_documentation.yml"
+    with:
+      pr_number: ${{ needs.style.outputs.pr_number }}
+      commit_sha: ${{ needs.style.outputs.new_commit_sha }}
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
@ -145,7 +145,7 @@ jobs:
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MODELS: ${{ needs.get-tests.outputs.models }}
-          BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
+          BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
        run: |
          gh api \
            --method POST \
@ -185,7 +185,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
       group: '${{ matrix.machine_type }}'
    container:
@ -239,7 +239,7 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -292,7 +292,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -338,7 +338,7 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
--- a/.github/workflows/self-scheduled-amd-mi210-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@ -1,55 +0,0 @@
-name: Self-hosted runner (AMD mi210 scheduled CI caller)
-
-on:
-  workflow_run:
-    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
-    branches: ["main"]
-    types: [completed]
-  push:
-    branches:
-      - run_amd_scheduled_ci_caller*
-
-jobs:
-  model-ci:
-    name: Model CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
-      job: run_models_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi210
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi210
-    secrets: inherit
-
-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi210
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi210
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi210
-      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi210
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi210
-      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi210
-    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-mi250-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@ -15,10 +15,11 @@ jobs:
    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
    with:
      job: run_models_gpu
-      slack_report_channel: "#amd-hf-ci"
+      slack_report_channel: "#transformers-ci-daily-amd"
      runner: mi250
      docker: huggingface/transformers-pytorch-amd-gpu
      ci_event: Scheduled CI (AMD) - mi250
+      report_repo_id: optimum-amd/transformers_daily_ci
    secrets: inherit

  torch-pipeline:
@ -26,10 +27,11 @@ jobs:
    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
    with:
      job: run_pipelines_torch_gpu
-      slack_report_channel: "#amd-hf-ci"
+      slack_report_channel: "#transformers-ci-daily-amd"
      runner: mi250
      docker: huggingface/transformers-pytorch-amd-gpu
      ci_event: Scheduled CI (AMD) - mi250
+      report_repo_id: optimum-amd/transformers_daily_ci
    secrets: inherit

  example-ci:
@ -37,10 +39,11 @@ jobs:
    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
    with:
      job: run_examples_gpu
-      slack_report_channel: "#amd-hf-ci"
+      slack_report_channel: "#transformers-ci-daily-amd"
      runner: mi250
      docker: huggingface/transformers-pytorch-amd-gpu
      ci_event: Scheduled CI (AMD) - mi250
+      report_repo_id: optimum-amd/transformers_daily_ci
    secrets: inherit

  deepspeed-ci:
@ -48,8 +51,9 @@ jobs:
    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
    with:
      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#amd-hf-ci"
+      slack_report_channel: "#transformers-ci-daily-amd"
      runner: mi250
      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
      ci_event: Scheduled CI (AMD) - mi250
+      report_repo_id: optimum-amd/transformers_daily_ci
    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-mi300-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi300-caller.yml
@ -0,0 +1,63 @@
+name: Self-hosted runner scale set (AMD mi300 scheduled CI caller)
+
+# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
+# For example, 1gpu scale set: amd-mi300-ci-1gpu
+#              2gpu scale set: amd-mi300-ci-2gpu
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_scheduled_ci_caller*
+
+jobs:
+  model-ci:
+    name: Model CI
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
+    with:
+      job: run_models_gpu
+      slack_report_channel: "#amd-hf-ci"
+      runner_scale_set: amd-mi300-ci
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi300
+      report_repo_id: optimum-amd/transformers_daily_ci
+    secrets: inherit
+
+  torch-pipeline:
+    name: Torch pipeline CI
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
+    with:
+      job: run_pipelines_torch_gpu
+      slack_report_channel: "#amd-hf-ci"
+      runner_scale_set: amd-mi300-ci
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi300
+      report_repo_id: optimum-amd/transformers_daily_ci
+    secrets: inherit
+
+  example-ci:
+    name: Example CI
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
+    with:
+      job: run_examples_gpu
+      slack_report_channel: "#amd-hf-ci"
+      runner_scale_set: amd-mi300-ci
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi300
+      report_repo_id: optimum-amd/transformers_daily_ci
+    secrets: inherit
+
+  deepspeed-ci:
+    name: DeepSpeed CI
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
+    with:
+      job: run_torch_cuda_extensions_gpu
+      slack_report_channel: "#amd-hf-ci"
+      runner_scale_set: amd-mi300-ci
+      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi300
+      report_repo_id: optimum-amd/transformers_daily_ci
+    secrets: inherit
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -8,8 +8,43 @@ on:
  push:
    branches:
      - run_scheduled_ci*
+  workflow_dispatch:
+    inputs:
+      prev_workflow_run_id:
+        description: 'previous workflow run id to compare'
+        type: string
+        required: false
+        default: ""
+      other_workflow_run_id:
+        description: 'other workflow run id to compare'
+        type: string
+        required: false
+        default: ""
+
+
+# Used for `push` to easily modify the target workflow runs to compare against
+env:
+    prev_workflow_run_id: ""
+    other_workflow_run_id: ""
+

 jobs:
+  setup:
+    name: Setup
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Setup
+        run: |
+          mkdir "setup_values"
+          echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
+          echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: setup_values
+          path: setup_values
+
  model-ci:
    name: Model CI
    uses: ./.github/workflows/self-scheduled.yml
@ -19,6 +54,7 @@ jobs:
      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
+      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

  torch-pipeline:
@ -30,17 +66,7 @@ jobs:
      runner: daily-ci
      docker: huggingface/transformers-pytorch-gpu
      ci_event: Daily CI
-    secrets: inherit
-
-  tf-pipeline:
-    name: TF pipeline CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_pipelines_tf_gpu
-      slack_report_channel: "#transformers-ci-daily-pipeline-tf"
-      runner: daily-ci
-      docker: huggingface/transformers-tensorflow-gpu
-      ci_event: Daily CI
+      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

  example-ci:
@ -52,6 +78,7 @@ jobs:
      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
+      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

  trainer-fsdp-ci:
@ -63,6 +90,7 @@ jobs:
      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
+      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

  deepspeed-ci:
@ -75,6 +103,7 @@ jobs:
      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
      ci_event: Daily CI
      working-directory-prefix: /workspace
+      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

  quantization-ci:
@ -86,4 +115,5 @@ jobs:
      runner: daily-ci
      docker: huggingface/transformers-quantization-latest-gpu
      ci_event: Daily CI
+      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -28,6 +28,10 @@ on:
        default: ''
        required: false
        type: string
+      report_repo_id:
+        required: true
+        type: string
+

 env:
  HF_HOME: /mnt/cache
@ -49,7 +53,7 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -107,7 +111,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs.yml
    with:
@ -125,7 +129,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
        slice_id: [0, 1]
    uses: ./.github/workflows/model_jobs.yml
    with:
@ -143,7 +147,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -177,7 +181,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -205,82 +209,13 @@ jobs:
          name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
          path: /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports

-  run_pipelines_tf_gpu:
-    if: ${{ inputs.job == 'run_pipelines_tf_gpu' }}
-    name: TensorFlow pipelines
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
-    runs-on:
-      group: '${{ matrix.machine_type }}'
-    container:
-      image: huggingface/transformers-tensorflow-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: |
-          git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        run: |
-          echo "${{ matrix.machine_type }}"
-
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run all pipeline tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: |
-          cat /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
-
  run_examples_gpu:
    if: ${{ inputs.job == 'run_examples_gpu' }}
    name: Examples directory
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -314,7 +249,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -349,7 +284,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -411,7 +346,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -448,7 +383,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -491,7 +426,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -567,7 +502,6 @@ jobs:
      run_models_gpu,
      run_trainer_and_fsdp_gpu,
      run_pipelines_torch_gpu,
-      run_pipelines_tf_gpu,
      run_examples_gpu,
      run_torch_cuda_extensions_gpu,
      run_quantization_torch_gpu,
@ -584,15 +518,21 @@ jobs:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
      ci_event: ${{ inputs.ci_event }}
+      report_repo_id: ${{ inputs.report_repo_id }}

    secrets: inherit

-  check_new_model_failures:
-    if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job == 'run_models_gpu' && needs.send_results.result == 'success' }}
-    name: Check new model failures
+  check_new_failures:
+    if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
+    name: Check new failures
    needs: send_results
-    uses: ./.github/workflows/check_failed_model_tests.yml
+    uses: ./.github/workflows/check_failed_tests.yml
    with:
      docker: ${{ inputs.docker }}
      start_sha: ${{ github.sha }}
+      job: ${{ inputs.job }}
+      slack_report_channel: ${{ inputs.slack_report_channel }}
+      ci_event: ${{ inputs.ci_event }}
+      report_repo_id: ${{ inputs.report_repo_id }}
+
    secrets: inherit
--- a/.github/workflows/slack-report.yml
+++ b/.github/workflows/slack-report.yml
@ -21,6 +21,9 @@ on:
      ci_event:
        required: true
        type: string
+      report_repo_id:
+        required: true
+        type: string

 env:
  TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@ -39,8 +42,23 @@ jobs:

      - uses: actions/checkout@v4
      - uses: actions/download-artifact@v4
+
+      - name: Prepare some setup values
+        run: |
+          if [ -f setup_values/prev_workflow_run_id.txt ]; then
+            echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> $GITHUB_ENV
+          else
+            echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
+          fi
+
+          if [ -f setup_values/other_workflow_run_id.txt ]; then
+            echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV
+          else
+            echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
+          fi
+
      - name: Send message to Slack
-        if: ${{ inputs.job != 'run_quantization_torch_gpu' }}
+        shell: bash
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
@ -50,19 +68,22 @@ jobs:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          CI_EVENT: ${{ inputs.ci_event }}
          CI_SHA: ${{ github.sha }}
-          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
          CI_TEST_JOB: ${{ inputs.job }}
          SETUP_STATUS: ${{ inputs.setup_status }}
+          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        # For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an
        # empty string, and the called script still get one argument (which is the emtpy string).
        run: |
-          sudo apt-get install -y curl
          pip install huggingface_hub
          pip install slack_sdk
          pip show slack_sdk
-          python utils/notification_service.py "${{ inputs.folder_slices }}"
+          if [ "${{ inputs.quantization_matrix }}" != "" ]; then
+            python utils/notification_service.py "${{ inputs.quantization_matrix }}"
+          else
+            python utils/notification_service.py "${{ inputs.folder_slices }}"
+          fi          

      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
      - name: Failure table artifacts
@ -70,32 +91,3 @@ jobs:
        with:
          name: ci_results_${{ inputs.job }}
          path: ci_results_${{ inputs.job }}
-
-      - uses: actions/checkout@v4
-      - uses: actions/download-artifact@v4
-      - name: Send message to Slack for quantization workflow
-        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
-        env:
-          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
-          CI_EVENT: ${{ inputs.ci_event }}
-          CI_SHA: ${{ github.sha }}
-          CI_TEST_JOB: ${{ inputs.job }}
-          SETUP_STATUS: ${{ inputs.setup_status }}
-        # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change
-        # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`.
-        run: |
-          sudo apt-get install -y curl
-          pip install huggingface_hub
-          pip install slack_sdk
-          pip show slack_sdk
-          python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}"
-
-      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
-      - name: Failure table artifacts
-        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ci_results_${{ inputs.job }}
-          path: ci_results_${{ inputs.job }}
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@ -35,7 +35,7 @@ jobs:
        shell: bash
        run: |
          if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
-            echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV
+            echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
            echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -78,7 +78,7 @@ Once you've confirmed the bug hasn't already been reported, please include the f
 To get the OS and software versions automatically, run the following command:

 ```bash
-transformers-cli env
+transformers env
 ```

 You can also run the same command from the root of the repository:
--- a/2
+++ b/2
@ -79,7 +79,7 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency

 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
-	python utils/check_modular_conversion.py  --fix_and_overwrite
+	python utils/check_modular_conversion.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
 	python utils/check_doctest_list.py --fix_and_overwrite
 	python utils/check_docstrings.py --fix_and_overwrite
--- a/README.md
+++ b/README.md
@ -78,7 +78,6 @@ Create and activate a virtual environment with [venv](https://docs.python.org/3/
 # venv
 python -m venv .my-env
 source .my-env/bin/activate
-
 # uv
 uv venv .my-env
 source .my-env/bin/activate
@ -88,10 +87,10 @@ Install Transformers in your virtual environment.

 ```py
 # pip
-pip install transformers
+pip install "transformers[torch]"

 # uv
-uv pip install transformers
+uv pip install "transformers[torch]"
 ```

 Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error.
@ -99,7 +98,12 @@ Install Transformers from source if you want the latest changes in the library o
 ```shell
 git clone https://github.com/huggingface/transformers.git
 cd transformers
-pip install .
+
+# pip
+pip install '.[torch]'
+
+# uv
+uv pip install '.[torch]'
 ```

 ## Quickstart
@ -121,7 +125,7 @@ To chat with a model, the usage pattern is the same. The only difference is you
 > [!TIP]
 > You can also chat with a model directly from the command line.
 > ```shell
-> transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
+> transformers chat Qwen/Qwen2.5-0.5B-Instruct
 > ```

 ```py
--- a/benchmark/benchmarks_entrypoint.py
+++ b/benchmark/benchmarks_entrypoint.py
@ -2,11 +2,11 @@ import argparse
 import importlib.util
 import logging
 import os
-from typing import Dict
 import sys
+from typing import Dict, Tuple

-from psycopg2.extras import Json
 from psycopg2.extensions import register_adapter
+from psycopg2.extras import Json


 register_adapter(dict, Json)
@ -17,10 +17,13 @@ class ImportModuleException(Exception):


 class MetricsRecorder:
-    def __init__(self, connection, logger: logging.Logger, branch: str, commit_id: str, commit_msg: str):
+    def __init__(
+        self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str
+    ):
        self.conn = connection
        self.conn.autocommit = True
        self.logger = logger
+        self.repository = repository
        self.branch = branch
        self.commit_id = commit_id
        self.commit_msg = commit_msg
@ -32,8 +35,8 @@ class MetricsRecorder:
        # gpu_name: str, model_id: str
        with self.conn.cursor() as cur:
            cur.execute(
-                "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
-                (self.branch, self.commit_id, self.commit_msg, metadata),
+                "INSERT INTO benchmarks (repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s) RETURNING benchmark_id",
+                (self.repository, self.branch, self.commit_id, self.commit_msg, metadata),
            )
            benchmark_id = cur.fetchone()[0]
            logger.debug(f"initialised benchmark #{benchmark_id}")
@ -82,12 +85,18 @@ handler.setFormatter(formatter)
 logger.addHandler(handler)


-def parse_arguments():
+def parse_arguments() -> Tuple[str, str, str, str]:
    """
    Parse command line arguments for the benchmarking CLI.
    """
    parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.")

+    parser.add_argument(
+        "repository",
+        type=str,
+        help="The repository name on which the benchmarking is performed.",
+    )
+
    parser.add_argument(
        "branch",
        type=str,
@ -108,7 +117,7 @@ def parse_arguments():

    args = parser.parse_args()

-    return args.branch, args.commit_id, args.commit_msg
+    return args.repository, args.branch, args.commit_id, args.commit_msg


 def import_from_path(module_name, file_path):
@ -125,7 +134,7 @@ def import_from_path(module_name, file_path):
 if __name__ == "__main__":
    benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__))

-    branch, commit_id, commit_msg = parse_arguments()
+    repository, branch, commit_id, commit_msg = parse_arguments()

    for entry in os.scandir(benchmarks_folder_path):
        try:
@ -136,7 +145,7 @@ if __name__ == "__main__":
            logger.debug(f"loading: {entry.name}")
            module = import_from_path(entry.name.split(".")[0], entry.path)
            logger.info(f"running benchmarks in: {entry.name}")
-            module.run_benchmark(logger, branch, commit_id, commit_msg)
+            module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
        except ImportModuleException as e:
            logger.error(e)
        except Exception as e:
--- a/benchmark/init_db.sql
+++ b/benchmark/init_db.sql
@ -1,5 +1,6 @@
 CREATE TABLE IF NOT EXISTS benchmarks (
  benchmark_id SERIAL PRIMARY KEY,
+  repository VARCHAR(255),
  branch VARCHAR(255),
  commit_id VARCHAR(72),
  commit_message VARCHAR(70),
--- a/benchmark/llama.py
+++ b/benchmark/llama.py
@ -33,11 +33,15 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
        sleep(0.01)


-def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
+def run_benchmark(
+    logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100
+):
    continue_metric_collection = Event()
    metrics_thread = None
    model_id = "meta-llama/Llama-2-7b-hf"
-    metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)
+    metrics_recorder = MetricsRecorder(
+        psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg
+    )
    try:
        gpu_stats = gpustat.GPUStatCollection.new_query()
        gpu_name = gpu_stats[0]["name"]
--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -5,7 +5,7 @@ ARG REF=main
 RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
-RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 # tensorflow pin matching setup.py
 RUN uv pip install --no-cache-dir pypi-kenlm
 RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
--- a/docker/custom-tokenizers.dockerfile
+++ b/docker/custom-tokenizers.dockerfile
@ -16,7 +16,7 @@ RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
 RUN make install -j 10


-RUN uv pip install --no-cache --upgrade 'torch==2.6.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used so not tested. Causes to failures. TODO fix later
--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
 RUN uv pip uninstall transformers
--- a/docker/exotic-models.dockerfile
+++ b/docker/exotic-models.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps timm accelerate
 RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
 # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
 RUN uv pip uninstall transformers
--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
 RUN uv pip uninstall transformers
--- a/docker/torch-tf-light.dockerfile
+++ b/docker/torch-tf-light.dockerfile
@ -7,7 +7,7 @@ RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-de
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
-RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN git lfs install

 RUN uv pip install --no-cache-dir pypi-kenlm
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -28,7 +28,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
 # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
 #    Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
-RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 "tensorflow_text<2.16" "tensorflow_probability<0.22" && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability

 RUN python3 -m pip uninstall -y flax jax

@ -45,7 +45,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum

 # For video model testing
-RUN python3 -m pip install --no-cache-dir av==9.2.0
+RUN python3 -m pip install --no-cache-dir av

 # Some slow tests require bnb
 RUN python3 -m pip install --no-cache-dir bitsandbytes
@ -71,6 +71,12 @@ RUN python3 -m pip install --no-cache-dir g2p-en
 # For Some bitsandbytes tests
 RUN python3 -m pip install --no-cache-dir einops

+# For Some tests with `@require_liger_kernel`
+RUN python3 -m pip install --no-cache-dir liger-kernel
+
+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -1,4 +1,4 @@
-FROM rocm/dev-ubuntu-22.04:6.2.4
+FROM rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive
@ -11,9 +11,6 @@ RUN apt update && \
 RUN git lfs install

 RUN python3 -m pip install --no-cache-dir --upgrade pip numpy
-
-RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2.4
-
 RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"

 ARG REF=main
@ -33,3 +30,6 @@ RUN cd transformers && python3 setup.py develop

 # Remove nvml and nvidia-ml-py as it is not compatible with ROCm. apex is not tested on NVIDIA either.
 RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
+
+# `kernels` may cause many failing tests
+RUN python3 -m pip uninstall -y kernels
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@ -48,3 +48,6 @@ RUN python3 -c "from deepspeed.launcher.runner import main"

 # Remove nvml as it is not compatible with ROCm
 RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
+
+# `kernels` may cause many failing tests
+RUN python3 -m pip uninstall -y kernels
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -45,6 +45,9 @@ RUN python3 -m pip uninstall -y deepspeed
 # TODO: Find out why test fail.
 RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
@ -57,6 +57,9 @@ RUN python3 -m pip uninstall -y deepspeed
 #RUN git clone https://github.com/pytorch/TensorRT.git
 #RUN cd TensorRT/py && python3 setup.py install --fx-only

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@ -28,6 +28,9 @@ RUN python3 -m pip uninstall -y tensorflow flax
 RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -90,6 +90,9 @@ RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
 # Add transformers in editable mode
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docs/source/de/add_new_model.md
+++ b/docs/source/de/add_new_model.md
@ -95,7 +95,7 @@ wie der Code geschrieben werden sollte :-)
 1. Der Vorwärtsdurchlauf Ihres Modells sollte vollständig in die Modellierungsdatei geschrieben werden und dabei völlig unabhängig von anderen
   Modellen in der Bibliothek. Wenn Sie einen Block aus einem anderen Modell wiederverwenden möchten, kopieren Sie den Code und fügen ihn mit einem
   `# Kopiert von` ein (siehe [hier](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
-   für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from). 
+   für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from).
 2. Der Code sollte vollständig verständlich sein, auch für einen Nicht-Muttersprachler. Das heißt, Sie sollten
   beschreibende Variablennamen wählen und Abkürzungen vermeiden. Ein Beispiel: `activation` ist `act` vorzuziehen.
   Von Variablennamen mit nur einem Buchstaben wird dringend abgeraten, es sei denn, es handelt sich um einen Index in einer for-Schleife.
@ -402,7 +402,7 @@ Andernfalls beginnen wir mit der Erstellung eines neuen Modells. Wir empfehlen d
 ein bestehendes Modell:

 ```bash
-transformers-cli add-new-model-like
+transformers add-new-model-like
 ```

 Sie werden mit einem Fragebogen aufgefordert, die grundlegenden Informationen Ihres Modells einzugeben.
--- a/docs/source/de/contributing.md
+++ b/docs/source/de/contributing.md
@ -63,7 +63,7 @@ Wenn Sie sich vergewissert haben, dass der Fehler noch nicht gemeldet wurde, geb
 Um das Betriebssystem und die Softwareversionen automatisch auszugeben, führen Sie den folgenden Befehl aus:

 ```bash
-transformers-cli env
+transformers env
 ```

 Sie können denselben Befehl auch im Hauptverzeichnis des Repositorys ausführen:
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -21,6 +21,8 @@
      title: Adding a new model to Transformers
    - local: modular_transformers
      title: Modular Transformers
+    - local: auto_docstring
+      title: Document your models
    - local: task_summary
      title: What 🤗 Transformers can do
    - local: tasks_explained
@ -37,6 +39,8 @@
      title: Tokenizers
    - local: image_processors
      title: Image processors
+    - local: video_processors
+      title: Video processors
    - local: backbones
      title: Backbones
    - local: feature_extractors
@ -72,12 +76,12 @@
      title: Prompt engineering
    - local: llm_optims
      title: Optimizing inference
+    - local: cache_explanation
+      title: Caching
    - local: kv_cache
      title: KV cache strategies
    - local: serving
      title: Serving
-    - local: cache_explanation
-      title: Caching
    - local: llm_tutorial_optimization
      title: Getting the most out of LLMs
    - local: perplexity
@ -125,8 +129,8 @@
      title: Hyperparameter search
    title: Trainer API
  - sections:
-    - local: gpu_selection
-      title: GPU selection
+    - local: accelerator_selection
+      title: Accelerator selection
    - local: accelerate
      title: Accelerate
    - local: fsdp
@ -149,6 +153,8 @@
      title: TPU
    - local: perf_train_special
      title: Apple Silicon
+    - local: perf_train_gaudi
+      title: Intel Gaudi
    - local: perf_hardware
      title: Build your own machine
    title: Hardware
@ -358,7 +364,9 @@
      title: Feature Extractor
    - local: main_classes/image_processor
      title: Image Processor
-    title: Main classes
+    - local: main_classes/video_processor
+      title: Video Processor
+    title: Main Classes
  - sections:
    - sections:
      - local: model_doc/albert
@ -378,7 +386,7 @@
      - local: model_doc/bert-japanese
        title: BertJapanese
      - local: model_doc/bertweet
-        title: Bertweet
+        title: BERTweet
      - local: model_doc/big_bird
        title: BigBird
      - local: model_doc/bigbird_pegasus
@ -447,6 +455,8 @@
        title: Falcon
      - local: model_doc/falcon3
        title: Falcon3
+      - local: model_doc/falcon_h1
+        title: FalconH1
      - local: model_doc/falcon_mamba
        title: FalconMamba
      - local: model_doc/flan-t5
@ -493,6 +503,8 @@
        title: Granite
      - local: model_doc/granitemoe
        title: GraniteMoe
+      - local: model_doc/granitemoehybrid
+        title: GraniteMoeHybrid
      - local: model_doc/granitemoeshared
        title: GraniteMoeShared
      - local: model_doc/helium
@ -530,7 +542,7 @@
      - local: model_doc/mamba
        title: Mamba
      - local: model_doc/mamba2
-        title: mamba2
+        title: Mamba2
      - local: model_doc/marian
        title: MarianMT
      - local: model_doc/markuplm
@ -543,6 +555,8 @@
        title: MegatronBERT
      - local: model_doc/megatron_gpt2
        title: MegatronGPT2
+      - local: model_doc/minimax
+        title: MiniMax
      - local: model_doc/mistral
        title: Mistral
      - local: model_doc/mixtral
@ -821,6 +835,8 @@
        title: Bark
      - local: model_doc/clap
        title: CLAP
+      - local: model_doc/csm
+        title: CSM
      - local: model_doc/dac
        title: dac
      - local: model_doc/encodec
@ -923,6 +939,8 @@
        title: CLVP
      - local: model_doc/colpali
        title: ColPali
+      - local: model_doc/colqwen2
+        title: ColQwen2
      - local: model_doc/data2vec
        title: Data2Vec
      - local: model_doc/deplot
@ -1107,4 +1125,9 @@
    - local: internal/time_series_utils
      title: Utilities for Time Series
    title: Internal helpers
+  - sections:
+    - local: reference/environment_variables
+      title: Environment Variables
+    title: Reference
  title: API
+
--- a/docs/source/en/accelerator_selection.md
+++ b/docs/source/en/accelerator_selection.md
@ -0,0 +1,126 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Accelerator selection
+
+During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
+
+This guide will show you how to select the number of accelerators to use and the order to use them in.
+
+## Number of accelerators
+
+For example, if there are 4 accelerators and you only want to use the first 2, run the command below.
+
+<hfoptions id="select-accelerator">
+<hfoption id="torchrun">
+
+Use `--nproc_per_node` to select how many accelerators to use.
+
+```bash
+torchrun --nproc_per_node=2  trainer-program.py ...
+```
+
+</hfoption>
+<hfoption id="Accelerate">
+
+Use `--num_processes` to select how many accelerators to use.
+
+```bash
+accelerate launch --num_processes 2 trainer-program.py ...
+```
+
+</hfoption>
+<hfoption id="DeepSpeed">
+
+Use `--num_gpus` to select how many GPUs to use.
+
+```bash
+deepspeed --num_gpus 2 trainer-program.py ...
+```
+
+</hfoption>
+</hfoptions>
+
+## Order of accelerators
+To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.
+
+For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to use accelerators 0 and 2:
+
+<hfoptions id="accelerator-type">
+<hfoption id="CUDA">
+
+```bash
+CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
+```
+
+Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.  
+To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):
+
+
+```bash
+CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
+```
+
+To run without any GPUs:
+
+```bash
+CUDA_VISIBLE_DEVICES= python trainer-program.py ...
+```
+
+You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`:
+
+- Order by PCIe bus ID (matches `nvidia-smi`):
+
+    ```bash
+    export CUDA_DEVICE_ORDER=PCI_BUS_ID
+    ```
+
+- Order by compute capability (fastest first):
+
+    ```bash
+    export CUDA_DEVICE_ORDER=FASTEST_FIRST
+    ```
+
+</hfoption>
+<hfoption id="Intel XPU">
+
+```bash
+ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
+```
+
+Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.  
+To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
+
+```bash
+ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
+```
+
+
+You can also control the order of Intel XPUs with:
+
+```bash
+export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+```
+
+For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation.
+
+</hfoption>
+</hfoptions>
+
+
+
+> [!WARNING]
+> Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@ -161,7 +161,7 @@ The downside is that if you aren't used to them, it may take some time to get us
 Run the command below to start and complete the questionnaire with some basic information about the new model. This command jumpstarts the process by automatically generating some model code that you'll need to adapt.

 ```bash
-transformers-cli add-new-model-like
+transformers add-new-model-like
 ```

 ## Create a pull request
@ -292,7 +292,7 @@ Once you're able to run the original checkpoint, you're ready to start adapting

 ## Adapt the model code

-The `transformers-cli add-new-model-like` command should have generated a model and configuration file.
+The `transformers add-new-model-like` command should have generated a model and configuration file.

 - `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py`
 - `src/transformers/models/brand_new_llama/configuration_brand_new_llama.py`
@ -551,10 +551,10 @@ While this example doesn't include an image processor, you may need to implement

 If you do need to implement a new image processor, refer to an existing image processor to understand the expected structure. Slow image processors ([`BaseImageProcessor`]) and fast image processors ([`BaseImageProcessorFast`]) are designed differently, so make sure you follow the correct structure based on the processor type you're implementing.

-Run the following command (only if you haven't already created the fast image processor with the `transformers-cli add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.
+Run the following command (only if you haven't already created the fast image processor with the `transformers add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.

 ```bash
-transformers-cli add-fast-image-processor --model-name your_model_name
+transformers add-fast-image-processor --model-name your_model_name
 ```

 This command will generate the necessary imports and provide a pre-filled template for the fast image processor. You can then modify it to fit your model's needs.
--- a/docs/source/en/attention_interface.md
+++ b/docs/source/en/attention_interface.md
@ -108,7 +108,7 @@ If in doubt about what args/kwargs a given model sends to the attention function
 ## Accessing current available implementations

 Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
-and/or perform a few checks, the prefered way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
+and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
 would expect from a usual Python dictionary:

 ```python
@ -125,4 +125,44 @@ would expect from a usual Python dictionary:

 # You can also globally `register` a new function directly on it
 >>> ALL_ATTENTION_FUNCTIONS.register("new_func", new_func)
-```
+```
+
+## Attention Mask Interface
+
+Having a new attention function may mean that you need a new format of attention mask to decide what key and value tokens
+the query tokens should attend to. This is now possible with the `AttentionMaskInterface`! It works in the same way as
+the `AttentionInterface`:
+
+```python
+from transformers import AttentionMaskInterface
+from transformers.masking_utils import sdpa_mask
+import torch
+
+def my_new_sdpa_mask(*args, **kwargs):
+    print("I just entered the attention mask computation")
+    return sdpa_mask(*args, **kwargs)
+
+AttentionMaskInterface.register("my_new_sdpa_mask", my_new_sdpa_mask)
+```
+
+The reason you have to register it is because we need to automatically correct your mask format based on the attention implementation (for example, flex attention uses a BlockMask format, while sdpa uses a 4D tensor).
+By default, if you do not register an attention mask function along with your attention function, mask creation will be skipped
+and `attention_mask=None` will be passed along to the Attention layers.
+
+The default signature of the attention mask functions is the following:
+
+```python
+def custom_attention_mask(
+    batch_size: int,  # required arg
+    cache_position: torch.Tensor,  # required arg
+    kv_length: int,  # required arg
+    kv_offset: int = 0,  # required arg
+    mask_function: Callable = causal_mask_function,  # required arg
+    attention_mask: Optional[torch.Tensor] = None,  # required arg
+    **kwargs,  # a few additional args may be passed as kwargs, especially the model's config is always passed
+) -> Optional[torch.Tensor]:
+```
+
+It mostly works thanks to the `mask_function`, which is a `Callable` in the form of [torch's mask_mod functions](https://pytorch.org/blog/flexattention/), taking 4 indices as input and returning a boolean to indicate if this position should take part in the attention computation.
+
+If you cannot use the `mask_function` to create your mask for some reason, you can try to work around it by doing something similar to our [torch export workaround](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/executorch.py).
--- a/docs/source/en/auto_docstring.md
+++ b/docs/source/en/auto_docstring.md
@ -0,0 +1,279 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Utilizing the @auto_docstring Decorator
+
+The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
+
+---
+
+## 📜 How it Works
+
+The `@auto_docstring` decorator constructs docstrings by:
+
+1.  **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
+2.  **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
+3.  **Overriding or Adding Arguments Descriptions:**
+    * **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
+    * **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
+4.  **Adding Classes and Functions Introduction:**
+    * **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
+    * **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
+5.  **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
+6.  **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
+7.  **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extract field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
+8.  **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
+
+
+---
+
+## 🚀 How to Use @auto_docstring
+
+### 1. Importing the Decorator
+Import the decorator into your modeling file:
+
+```python
+from ...utils import auto_docstring
+```
+
+### 2. Applying to Classes
+Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
+
+```python
+from transformers.modeling_utils import PreTrainedModel
+from ...utils import auto_docstring
+
+@auto_docstring
+class MyAwesomeModel(PreTrainedModel):
+    def __init__(self, config, custom_parameter: int = 10, another_custom_arg: str = "default"):
+        r"""
+        custom_parameter (`int`, *optional*, defaults to 10):
+            Description of the custom_parameter for MyAwesomeModel.
+        another_custom_arg (`str`, *optional*, defaults to "default"):
+            Documentation for another unique argument.
+        """
+        super().__init__(config)
+        self.custom_parameter = custom_parameter
+        self.another_custom_arg = another_custom_arg
+        # ... rest of your init
+
+    # ... other methods
+```
+
+#### Advanced Class Decoration:
+
+Arguments can be passed directly to `@auto_docstring` for more control:
+
+```python
+@auto_docstring(
+    custom_intro="""This model performs specific synergistic operations.
+    It builds upon the standard Transformer architecture with unique modifications.""",
+    custom_args="""
+    custom_parameter (`type`, *optional*, defaults to `default_value`):
+        A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
+    internal_helper_arg (`type`, *optional*, defaults to `default_value`):
+        A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
+    """
+)
+class MySpecialModel(PreTrainedModel):
+    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
+        # ...
+```
+
+Or:
+
+```python
+@auto_docstring(
+    custom_intro="""This model performs specific synergistic operations.
+    It builds upon the standard Transformer architecture with unique modifications.""",
+)
+class MySpecialModel(PreTrainedModel):
+    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
+        r"""
+        custom_parameter (`type`, *optional*, defaults to `default_value`):
+            A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
+        internal_helper_arg (`type`, *optional*, defaults to `default_value`):
+            A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
+        """
+        # ...
+```
+
+### 3. Applying to Functions (e.g., `forward` method)
+Apply the decorator above method definitions, such as the `forward` method.
+
+```python
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        new_custom_argument: Optional[torch.Tensor] = None,
+        arg_documented_in_args_doc: Optional[torch.Tensor] = None,
+        # ... other arguments
+    ) -> Union[Tuple, ModelOutput]: # The description of the return value will automatically be generated from the ModelOutput class docstring.
+        r"""
+        new_custom_argument (`torch.Tensor`, *optional*):
+            Description of this new custom argument and its expected shape or type.
+        """
+        # ...
+```
+
+#### Advanced Function Decoration:
+
+Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:
+
+```python
+MODEL_COMMON_CUSTOM_ARGS = r"""
+    common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
+        Description of common_arg_1
+    common_arg_2 (`torch.Tensor`, *optional*, defaults to `default_value`):
+        Description of common_arg_2
+    ...
+"""
+
+class MyModel(PreTrainedModel):
+    # ...
+    @auto_docstring(
+        custom_intro="""
+        This is a custom introduction for the function.
+        """,
+        custom_args=MODEL_COMMON_CUSTOM_ARGS
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        common_arg_1: Optional[torch.Tensor] = None,
+        common_arg_2: Optional[torch.Tensor] = None,
+        #...
+        function_specific_argument: Optional[torch.Tensor] = None,
+        # ... other arguments
+    ) -> torch.Tensor:
+        r"""
+        function_specific_argument (`torch.Tensor`, *optional*):
+            Description of an argument specific to this function
+
+        Returns:
+            `torch.Tensor`: For a function returning a generic type, a custom "Returns" section can be specified.
+
+        Example:
+
+        (To override the default example with a custom one or to add an example for a model class that does not have a pipeline)
+
+        ```python
+        ...
+        ```
+        """
+        # ...
+```
+
+---
+
+### ✍️ Documenting Arguments: Approach & Priority
+
+1.  **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
+    * `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
+
+2.  **New or Custom Arguments:**
+    * **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
+    * **Format:**
+        ```
+        argument_name (`type`, *optional*, defaults to `X`):
+            Description of the argument.
+            Explain its purpose, expected shape/type if complex, and default behavior.
+            This can span multiple lines.
+        ```
+    * Include `type` in backticks.
+    * Add "*optional*" if the argument is not required (has a default value).
+    * Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
+
+3.  **Overriding Standard Arguments:**
+    * If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
+    * The `labels` argument is often customized per model and typically requires a specific docstring.
+
+4.  **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
+    * New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
+
+---
+
+### Usage with [modular files](./modular_transformers)
+
+When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
+
+- **For standalone models in modular files:**
+  Apply the `@auto_docstring` decorator just as you would in regular modeling files.
+
+- **For models inheriting from other library models:**
+  - When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
+  - If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
+
+  > **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
+
+
+**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
+
+---
+
+## ✅ Checking Your Docstrings with `check_auto_docstrings`
+
+The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
+
+#### What it Checks:
+
+* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
+* **Argument Completeness & Consistency:**
+    * Flags arguments in the signature that are not known standard arguments and lack a local description.
+    * Ensures documented arguments exist in the signature. (TODO)
+    * Verifies that types and default values in the docstring match the signature. (TODO)
+* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
+* **Formatting:** Adherence to the expected docstring style.
+
+#### Running the Check Locally:
+
+Run this check locally before committing. The common command is:
+
+```bash
+make fix-copies
+```
+
+Alternatively, to only perform docstrings and auto-docstring checks, you can use:
+
+```bash
+python utils/check_docstrings.py # to only check files included in the diff without fixing them
+# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
+# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
+```
+
+#### Workflow with the Checker:
+
+1.  Add `@auto_docstring(...)` to the class or method.
+2.  For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
+3.  Run `make fix-copies` (or the `check_docstrings.py` utility).
+    * For unrecognized arguments lacking documentation, the utility will create placeholder entries.
+4.  Manually edit these placeholders with accurate types and descriptions.
+5.  Re-run the check to ensure all issues are resolved.
+
+---
+
+## 🔑 Key Takeaways & Best Practices
+
+* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary methods (e.g., `forward`, `get_text_features`, etc.).
+* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
+* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
+* Document new or custom arguments clearly.
+* Run `check_docstrings` locally and iteratively.
+
+By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
--- a/docs/source/en/cache_explanation.md
+++ b/docs/source/en/cache_explanation.md
@ -15,8 +15,7 @@ rendered properly in your Markdown viewer.
 -->

 # Caching
-
-Imagine you’re having a conversation with someone, and instead of remembering what they previously said, they have to start from scratch every time you respond. This would be slow and inefficient, right?
+Imagine you're having a conversation with someone, and instead of remembering what they previously said, they have to start from scratch every time you respond. This would be slow and inefficient, right?

 You can extend this analogy to transformer models. Autoregressive model generation can be slow because it makes a prediction one token at a time. Each new prediction is dependent on all the previous context.

@ -29,8 +28,50 @@ A key-value (KV) cache eliminates this inefficiency by storing kv pairs derived
 > [!WARNING]
 > Caching should only be used for **inference**. It may cause unexpected errors if it's enabled during training.

+To better understand how and why caching works, let's take a closer look at the structure of the attention matrices.
+
+## Attention matrices
+
+The **scaled dot-product attention** is calculated as shown below for a batch of size `b`, number of attention heads `h`, sequence length so far `T`, and dimension per attention head `d_head`.
+
+$$
+\text{Attention}(Q, K, V) = \text{softmax}\left( \frac{Q K^\top}{\sqrt{d_{\text{head}}}} + \text{mask} \right) V
+$$
+
+The query (`Q`), key (`K`), and value (`V`) matrices are projections from the input embeddings of shape `(b, h, T, d_head)`.
+
+For causal attention, the mask prevents the model from attending to future tokens. Once a token is processed, its representation never changes with respect to future tokens, which means \\( K_{\text{past}} \\) and \\( V_{\text{past}} \\) can be cached and reused to compute the last token's representation.
+
+$$
+\text{Attention}(q_t, [\underbrace{k_1, k_2, \dots, k_{t-1}}_{\text{cached}}, k_{t}], [\underbrace{v_1, v_2, \dots, v_{t-1}}_{\text{cached}}, v_{t}])
+$$
+
+At inference time, you only need the last token's query to compute the representation \\( x_t \\) that predicts the next token \\( t+1 \\). At each step, the new key and value vectors are **stored** in the cache and **appended** to the past keys and values.
+
+$$
+K_{\text{cache}} \leftarrow \text{concat}(K_{\text{past}}, k_t), \quad V_{\text{cache}} \leftarrow \text{concat}(V_{\text{past}}, v_t)
+$$
+
+Attention is calculated independently in each layer of the model, and caching is done on a per-layer basis.
+
+Refer to the table below to compare how caching improves efficiency.
+
+| without caching | with caching |
+|---|---|
+| for each step, recompute all previous `K` and `V` | for each step, only compute current `K` and `V` |
+| attention cost per step is **quadratic** with sequence length | attention cost per step is **linear** with sequence length (memory grows linearly, but compute/token remains low) |
+
+
+
 ## Cache class

+A basic KV cache interface takes a key and value tensor for the current token and returns the updated `K` and `V` tensors. This is internally managed by a model's `forward` method.
+
+```py
+new_K, new_V = cache.update(k_t, v_t, layer_idx)
+attn_output = attn_layer_idx_fn(q_t, new_K, new_V)
+```
+
 When you use Transformers' [`Cache`] class, the self-attention module performs several critical steps to integrate past and present information.

 1. The attention module concatenates current kv pairs with past kv pairs stored in the cache. This creates attentions weights with the shape `(new_tokens_length, past_kv_length + new_tokens_length)`. The current and past kv pairs are essentially combined to compute the attention scores, ensuring a model is aware of previous context and the current input.
@ -39,6 +80,27 @@ When you use Transformers' [`Cache`] class, the self-attention module performs s

 3. It is also important to be aware of the `cache_position`. This is important if you want to reuse a prefilled [`Cache`] with the `forward` method because you have to pass a valid `cache_position` value. This indicates the input positions in a sequence. `cache_position` is unaffected by padding, and it always adds one more position for each token. For example, if a kv cache contains 10 tokens - regardless of pad tokens - the cache position for the next token should be `torch.tensor([10])`.

+## Cache storage implementation
+
+The actual storage of key-value pairs varies between cache implementations. As an example, consider the [`DynamicCache`].
+
+
+In [`DynamicCache`], the key-value pairs are stored as two lists of tensors. Each tensor in the lists has the shape `[batch_size, num_heads, seq_len, head_dim]`.
+- `key_cache`: A list of tensors, one for each layer.
+- `value_cache`: A list of tensors, one for each layer.
+
+When new tokens are processed:
+
+1. For each layer, the new key and value states are concatenated with the existing cache.
+```py
+self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+```
+
+2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.
+
+3. The cache maintains a count of seen tokens through `self._seen_tokens`. This is updated when the first layer processes a new token.
+
 The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.

 ```py
@ -72,10 +134,14 @@ for _ in range(max_new_tokens):
 print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
 "[INST] Hello, what's your name. [/INST]  Hello! My name is LLaMA,"
 ```
-
 ## Legacy cache format

-Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format has is dynamic because it grows as text is generated, similar to [`DynamicCache`].
+Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].
+
+The legacy format is essentially the same data structure but organized differently.
+- It's a tuple of tuples, where each inner tuple contains the key and value tensors for a layer.
+- The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`.
+- The format is less flexible and doesn't support features like quantization or offloading.

 If your project depends on this legacy format, you can convert between [`DynamicCache`] and a tuple of tuples as shown below with the [`~DynamicCache.from_legacy_cache`] and [`DynamicCache.to_legacy_cache`] functions. This is helpful if you have custom logic for manipulating a cache in a specific format.

--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -25,22 +25,28 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_

 This guide shows you how to quickly start chatting with Transformers from the command line, how build and format a conversation, and how to chat using the [`TextGenerationPipeline`].

-## transformers-cli
+## transformers CLI

-Chat with a model directly from the command line as shown below. It launches an interactive session with a model. Enter `clear` to reset the conversation, `exit` to terminate the session, and `help` to display all the command options.
+After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.

 ```bash
-transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
+transformers chat Qwen/Qwen2.5-0.5B-Instruct
 ```

 <div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers-chat-cli.png"/>
 </div>

+You can launch the CLI with arbitrary `generate` flags, using the format `arg_1=value_1 arg_2=value_2 ...`.
+
+```bash
+transformers chat Qwen/Qwen2.5-0.5B-Instruct do_sample=False max_new_tokens=10
+```
+
 For a full list of options, run the command below.

 ```bash
-transformers-cli chat -h
+transformers chat -h
 ```

 The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).
@ -76,16 +82,16 @@ print(response[0]["generated_text"][-1]["content"])
 (sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,
 alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!

-So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million 
-things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of 
-Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for 
-something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got 
+So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
+things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
+Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
+something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
 some wild stuff, like that Warhol guy's soup cans and all that jazz.

-And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for 
+And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
 those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.

-Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might 
+Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
 even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)

 And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
@ -107,9 +113,9 @@ print(response[0]["generated_text"][-1]["content"])
 ```

 ```txt
-(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man! 
-It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's 
-like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!" 
+(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
+It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
+like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
 (sarcastically) Oh, yeah, real original, Andy.

 But, you know, back in the '60s, it was like, a big deal. People were all about challenging the
--- a/docs/source/en/generation_strategies.md
+++ b/docs/source/en/generation_strategies.md
@ -20,11 +20,15 @@ A decoding strategy informs how a model should select the next generated token.

 This guide will help you understand the different decoding strategies available in Transformers and how and when to use them.

-## Greedy search
+## Basic decoding methods

-Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in [`GenerationConfig`], this strategy generates a maximum of 20 tokens.
+These are well-established decoding methods, and should be your starting point for text generation tasks.

-Greedy search works well for tasks with relatively short outputs. However, it breaks down when generating longer sequences because it begins to repeat itself.
+### Greedy search
+
+Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in [`GenerationConfig`], this strategy generates a maximum of 20 new tokens.
+
+Greedy search works well for tasks with relatively short outputs where creativity is not a priority. However, it breaks down when generating longer sequences because it begins to repeat itself.

 ```py
 import torch
@ -40,11 +44,11 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
 'Hugging Face is an open-source company that provides a suite of tools and services for building, deploying, and maintaining natural language processing'
 ```

-## Contrastive search
+### Sampling

-[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied.
+Sampling, or multinomial sampling, randomly selects a token based on the probability distribution over the model's entire vocabulary (as opposed to the most likely token, as in greedy search). This means every token with a non-zero probability has a chance to be selected. Sampling strategies reduce repetition and can generate more creative and diverse outputs.

-Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return.
+Enable multinomial sampling with `do_sample=True` and `num_beams=1`.

 ```py
 import torch
@ -55,14 +59,14 @@ inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt"

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
 # explicitly set to 100 because Llama2 generation length is 4096
-outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
+outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
 tokenizer.batch_decode(outputs, skip_special_tokens=True)
-'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has'
+'Hugging Face is an open-source company 🤗\nWe are open-source and believe that open-source is the best way to build technology. Our mission is to make AI accessible to everyone, and we believe that open-source is the best way to achieve that.'
 ```

-## Beam search
+### Beam search

-Beam search keeps track of several generated sequences (beams) at each time step. After a certain number of steps, it selects the sequence with the highest *overall* probability. Unlike greedy search, this strategy can "look ahead" and pick a sequence with a higher probability overall even if the initial tokens have a lower probability.
+Beam search keeps track of several generated sequences (beams) at each time step. After a certain number of steps, it selects the sequence with the highest *overall* probability. Unlike greedy search, this strategy can "look ahead" and pick a sequence with a higher probability overall even if the initial tokens have a lower probability. It is best suited for input-grounded tasks, like describing an image or speech recognition. You can also use `do_sample=True` with beam search to sample at each step, but beam search will still greedily prune out low probability sequences between steps.

 > [!TIP]
 > Check out the [beam search visualizer](https://huggingface.co/spaces/m-ric/beam_search_visualizer) to see how beam search works.
@ -83,66 +87,11 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
 "['Hugging Face is an open-source company that develops and maintains the Hugging Face platform, which is a collection of tools and libraries for building and deploying natural language processing (NLP) models. Hugging Face was founded in 2018 by Thomas Wolf']"
 ```

-## Diverse beam search
+## Advanced decoding methods

-[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups.
+Advanced decoding methods aim at either tackling specific generation quality issues (e.g. repetition) or at improving the generation throughput in certain situations. These techniques are more complex, and may not work correctly with all models.

-Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`).
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
-
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
-# explicitly set to 100 because Llama2 generation length is 4096
-outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
-tokenizer.batch_decode(outputs, skip_special_tokens=True)
-'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a'
-```
-
-## Multinomial sampling
-
-Search methods selects the most likely tokens. Sampling, or multinomial sampling, randomly selects a token based on the probability distribution over the entire models vocabulary. This means every token with a non-zero probability has a chance to be selected. Sampling strategies reduce repetition and can generate more creative and diverse outputs.
-
-Enable multinomial sampling with `do_sample=True` and `num_beams=1`.
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
-
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
-# explicitly set to 100 because Llama2 generation length is 4096
-outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
-tokenizer.batch_decode(outputs, skip_special_tokens=True)
-'Hugging Face is an open-source company 🤗\nWe are open-source and believe that open-source is the best way to build technology. Our mission is to make AI accessible to everyone, and we believe that open-source is the best way to achieve that.'
-```
-
-## Beam search multinomial sampling
-
-This decoding strategy is a combination of beam search and multinomial sampling. It generates multiple beams and uses a sampling strategy for each beam.
-
-Enable beam search multinomial sampling by setting `num_beams` to a value greater than 1 and `do_sample=True`.
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
-
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
-# explicitly set to 100 because Llama2 generation length is 4096
-outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=4)
-'Hugging Face is an open-source company 100% dedicated to making AI more accessible. We believe that AI should be available to everyone, and we’re working hard to make that a reality.\nWe’re a team of passionate engineers, designers,'
-```
-
-## Speculative decoding
+### Speculative decoding

 [Speculative](https://hf.co/papers/2211.17192) or assistive decoding isn't a search or sampling strategy. Instead, speculative decoding adds a second smaller model to generate candidate tokens. The main model verifies the candidate tokens in a single `forward` pass, which speeds up the decoding process overall. This method is especially useful for LLMs where it can be more costly and slower to generate tokens. Refer to the [speculative decoding](./llm_optims#speculative-decoding) guide to learn more.

@ -203,7 +152,7 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
 </hfoption>
 </hfoptions>

-### Prompt lookup decoding
+#### Prompt lookup decoding

 [Prompt lookup decoding](./llm_optims#prompt-lookup-decoding) is a variant of speculative decoding that uses overlapping n-grams as the candidate tokens. It works well for input-grounded tasks such as summarization. Refer to the [prompt lookup decoding](./llm_optims#prompt-lookup-decoding) guide to learn more.

@ -245,7 +194,7 @@ outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_
 tokenizer.batch_decode(outputs, skip_special_tokens=True)
 ```

-### Universal assisted decoding
+#### Universal assisted decoding

 Universal assisted decoding (UAD) enables the main and assistant models to use different tokenizers. The main models input tokens are re-encoded into assistant model tokens. Candidate tokens are generated in the assistant encoding which are re-encoded into the main model candidate tokens. The candidate tokens are verified as explained in [speculative decoding](#speculative-decoding).

@ -269,7 +218,27 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
 ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
 ```

-## DoLa
+### Contrastive search
+
+[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied.
+
+Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+# explicitly set to 100 because Llama2 generation length is 4096
+outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
+tokenizer.batch_decode(outputs, skip_special_tokens=True)
+'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has'
+```
+
+### DoLa

 [Decoding by Contrasting Layers (DoLa)](https://hf.co/papers/2309.03883) is a contrastive decoding strategy for improving factuality and reducing hallucination. This strategy works by contrasting the logit differences between the final and early layers. As a result, factual knowledge localized to particular layers are amplified. DoLa is not recommended for smaller models like GPT-2.

@ -325,6 +294,209 @@ tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tok
 </hfoption>
 </hfoptions>

+### Diverse beam search
+
+[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups.
+
+Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`).
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+# explicitly set to 50 because Llama2 generation length is 4096
+outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
+tokenizer.batch_decode(outputs, skip_special_tokens=True)
+'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a'
+```
+
+
+## Custom decoding methods
+
+Custom decoding methods enable specialized generation behavior such as the following:
+- have the model continue thinking if it is uncertain;
+- roll back generation if the model gets stuck;
+- handle special tokens with custom logic;
+- enhance input preparation for advanced models.
+
+We enable custom decoding methods through model repositories, assuming a specific model tag and file structure (see subsection below). This feature is an extension of [custom modeling code](./models.md#custom-models) and, as such, requires setting `trust_remote_code=True`.
+
+If a model repository holds a custom decoding method, the easiest way to try it out is to load the model and generate with it:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# `transformers-community/custom_generate_example` holds a copy of `Qwen/Qwen2.5-0.5B-Instruct`, but
+# with custom generation code -> calling `generate` uses the custom decoding method!
+tokenizer = AutoTokenizer.from_pretrained("transformers-community/custom_generate_example")
+model = AutoModelForCausalLM.from_pretrained(
+    "transformers-community/custom_generate_example", device_map="auto", trust_remote_code=True
+)
+
+inputs = tokenizer(["The quick brown"], return_tensors="pt").to(model.device)
+# The custom decoding method is a minimal greedy decoding implementation. It also prints a custom message at run time.
+gen_out = model.generate(**inputs)
+# you should now see its custom message, "✨ using a custom generation method ✨"
+print(tokenizer.batch_decode(gen_out, skip_special_tokens=True))
+'The quick brown fox jumps over a lazy dog, and the dog is a type of animal. Is'
+```
+
+Model repositories with custom decoding methods have a special property: their decoding method can be loaded from **any** model through [`~GenerationMixin.generate`]'s `custom_generate` argument. This means anyone can create and share their custom generation method to potentially work with any Transformers model, without requiring users to install additional Python packages.
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
+
+inputs = tokenizer(["The quick brown"], return_tensors="pt").to(model.device)
+# `custom_generate` replaces the original `generate` by the custom decoding method defined in
+# `transformers-community/custom_generate_example`
+gen_out = model.generate(**inputs, custom_generate="transformers-community/custom_generate_example", trust_remote_code=True)
+print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+'The quick brown fox jumps over a lazy dog, and the dog is a type of animal. Is'
+```
+
+You should read the `README.md` file of the repository containing the custom generation strategy to see what the new arguments and output type differences are, if they exist. Otherwise, you can assume it works like the base [`~GenerationMixin.generate`] method.
+
+> [!TIP]
+> You can find all custom decoding methods by [searching for their custom tag](https://huggingface.co/models?other=custom_generate), `custom_generate`.
+
+Consider the Hub repository [transformers-community/custom_generate_example](https://huggingface.co/transformers-community/custom_generate_example) as an example. The `README.md` states that it has an additional input argument, `left_padding`, which adds a number of padding tokens before the prompt.
+
+```py
+gen_out = model.generate(
+    **inputs, custom_generate="transformers-community/custom_generate_example", trust_remote_code=True, left_padding=5
+)
+print(tokenizer.batch_decode(gen_out)[0])
+'<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>The quick brown fox jumps over the lazy dog.\n\nThe sentence "The quick'
+```
+
+If the custom method has pinned Python requirements that your environment doesn't meet, you'll get an exception about missing requirements. For instance, [transformers-community/custom_generate_bad_requirements](https://huggingface.co/transformers-community/custom_generate_bad_requirements) has an impossible set of requirements defined in its `custom_generate/requirements.txt` file, and you'll see the error message below if you try to run it.
+
+```
+ImportError: Missing requirements in your local environment for `transformers-community/custom_generate_bad_requirements`:
+foo (installed: None)
+bar==0.0.0 (installed: None)
+torch>=99.0 (installed: 2.6.0)
+```
+
+Updating your Python requirements accordingly will remove this error message.
+
+### Creating a custom decoding method
+
+To create a new decoding method, you need to create a new [**Model**](https://huggingface.co/new) repository and push a few files into it.
+1. The model you've designed your decoding method with.
+2. `custom_generate/generate.py`, which contains all the logic for your custom decoding method.
+3. `custom_generate/requirements.txt`, used to optionally add new Python requirements and/or lock specific versions to correctly use your method.
+4. `README.md`, where you should add the `custom_generate` tag and document any new arguments or output type differences of your custom method.
+
+After you've added all required files, your repository should look like this:
+
+```
+your_repo/
+├── README.md          # include the 'custom_generate' tag
+├── config.json
+├── ...
+└── custom_generate/
+    ├── generate.py
+    └── requirements.txt
+```
+
+#### Adding the base model
+
+The starting point for your custom decoding method is a model repository just like any other. The model to add to this repository should be the model you've designed your method with, and it is meant to be part of a working self-contained model-generate pair. When the model in this repository is loaded, your custom decoding method will override `generate`. Don't worry -- your decoding method can still be loaded with any other Transformers model, as explained in the section above.
+
+If you simply want to copy an existing model, you can do so as shown below:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("source/model_repo")
+model = AutoModelForCausalLM.from_pretrained("source/model_repo")
+tokenizer.save_pretrained("your/decoding_method", push_to_hub=True)
+model.save_pretrained("your/decoding_method", push_to_hub=True)
+```
+
+#### generate.py
+
+This is the core of your decoding method. It *must* contain a method named `generate`, and this method *must* take `model` as its first argument. `model` is the model instance, which means you have access to all attributes and methods in the model, including the ones defined in [`GenerationMixin`] (like the base `generate` method).
+
+> [!WARNING]
+> `generate.py` must be placed in a folder named `custom_generate`, and not at the root level of the repository. The file paths for this feature are hardcoded.
+
+Under the hood, when the base [`~GenerationMixin.generate`] method is called with a `custom_generate` argument, it first checks its Python requirements (if any), then locates the custom `generate` method in `generate.py`, and finally calls the custom `generate`. All received arguments and `model` are forwarded to your custom `generate` method, with the exception of the arguments used to trigger the custom generation (`trust_remote_code` and `custom_generate`).
+
+This means your `generate` can have a mix of original and custom arguments (as well as a different output type) as shown below.
+
+```py
+import torch
+
+def generate(model, input_ids, generation_config=None, left_padding=None, **kwargs):
+    generation_config = generation_config or model.generation_config  # default to the model generation config
+    cur_length = input_ids.shape[1]
+    max_length = generation_config.max_length or cur_length + generation_config.max_new_tokens
+
+    # Example of custom argument: add `left_padding` (integer) pad tokens before the prompt
+    if left_padding is not None:
+        if not isinstance(left_padding, int) or left_padding < 0:
+            raise ValueError(f"left_padding must be an integer larger than 0, but is {left_padding}")
+
+        pad_token = kwargs.pop("pad_token", None) or generation_config.pad_token_id or model.config.pad_token_id
+        if pad_token is None:
+            raise ValueError("pad_token is not defined")
+        batch_size = input_ids.shape[0]
+        pad_tensor = torch.full(size=(batch_size, left_padding), fill_value=pad_token).to(input_ids.device)
+        input_ids = torch.cat((pad_tensor, input_ids), dim=1)
+        cur_length = input_ids.shape[1]
+
+    # Simple greedy decoding loop
+    while cur_length < max_length:
+        logits = model(input_ids).logits
+        next_token_logits = logits[:, -1, :]
+        next_tokens = torch.argmax(next_token_logits, dim=-1)
+        input_ids = torch.cat((input_ids, next_tokens[:, None]), dim=-1)
+        cur_length += 1
+
+    return input_ids
+```
+
+Follow the recommended practices below to ensure your custom decoding method works as expected.
+- Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
+- Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
+- You can add other files in the `custom_generate` folder, and use relative imports.
+- Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment.
+
+#### requirements.txt
+
+You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.
+
+#### README.md
+
+The root-level `README.md` in the model repository usually describes the model therein. However, since the focus of the repository is the custom decoding method, we highly recommend shifting its focus towards describing the custom decoding method. In addition to a description of the method, we recommend documenting any input and/or output differences to the original [`~GenerationMixin.generate`]. This way, users can focus on what's new, and rely on Transformers docs for generic implementation details.
+
+For discoverability, we highly recommend adding the `custom_generate` tag to your repository. To do so, the top of your `README.md` file should look like the example below. After you push the file, you should see the tag in your repository!
+
+```
+---
+library_name: transformers
+tags:
+  - custom_generate
+---
+
+(your markdown content here)
+```
+
+Recommended practices:
+- Document input and output differences in [`~GenerationMixin.generate`].
+- Add self-contained examples to enable quick experimentation.
+- Describe soft requirements, such as whether the method only works well with a certain family of models.
+
+
 ## Resources

 Read the [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate) blog post for an explanation of how common decoding strategies work.
--- a/docs/source/en/gpu_selection.md
+++ b/docs/source/en/gpu_selection.md
@ -1,94 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# GPU selection
-
-During distributed training, you can specify the number of GPUs to use and in what order. This can be useful when you have GPUs with different computing power and you want to use the faster GPU first. Or you could only use a subset of the available GPUs. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
-
-This guide will show you how to select the number of GPUs to use and the order to use them in.
-
-## Number of GPUs
-
-For example, if there are 4 GPUs and you only want to use the first 2, run the command below.
-
-<hfoptions id="select-gpu">
-<hfoption id="torchrun">
-
-Use the `--nproc_per_node` to select how many GPUs to use.
-
-```bash
-torchrun --nproc_per_node=2  trainer-program.py ...
-```
-
-</hfoption>
-<hfoption id="Accelerate">
-
-Use `--num_processes` to select how many GPUs to use.
-
-```bash
-accelerate launch --num_processes 2 trainer-program.py ...
-```
-
-</hfoption>
-<hfoption id="DeepSpeed">
-
-Use `--num_gpus` to select how many GPUs to use.
-
-```bash
-deepspeed --num_gpus 2 trainer-program.py ...
-```
-
-</hfoption>
-</hfoptions>
-
-### Order of GPUs
-
-To select specific GPUs to use and their order, configure the `CUDA_VISIBLE_DEVICES` environment variable. It is easiest to set the environment variable in `~/bashrc` or another startup config file. `CUDA_VISIBLE_DEVICES` is used to map which GPUs are used. For example, if there are 4 GPUs (0, 1, 2, 3) and you only want to run GPUs 0 and 2:
-
-```bash
-CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
-```
-
-Only the 2 physical GPUs (0 and 2) are "visible" to PyTorch and these are mapped to `cuda:0` and `cuda:1` respectively. You can also reverse the order of the GPUs to use 2 first. The mapping becomes `cuda:1` for GPU 0 and `cuda:0` for GPU 2.
-
-```bash
-CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
-```
-
-You can also set the `CUDA_VISIBLE_DEVICES` environment variable to an empty value to create an environment without GPUs.
-
-```bash
-CUDA_VISIBLE_DEVICES= python trainer-program.py ...
-```
-
-> [!WARNING]
-> As with any environment variable, they can be exported instead of being added to the command line. However, this is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong GPUs. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
-
-`CUDA_DEVICE_ORDER` is an alternative environment variable you can use to control how the GPUs are ordered. You can order according to the following.
-
-1. PCIe bus IDs that matches the order of [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) and [`rocm-smi`](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/.doxygen/docBin/html/index.html) for NVIDIA and AMD GPUs respectively.
-
-```bash
-export CUDA_DEVICE_ORDER=PCI_BUS_ID
-```
-
-2. GPU compute ability.
-
-```bash
-export CUDA_DEVICE_ORDER=FASTEST_FIRST
-```
-
-The `CUDA_DEVICE_ORDER` is especially useful if your training setup consists of an older and newer GPU, where the older GPU appears first, but you cannot physically swap the cards to make the newer GPU appear first. In this case, set `CUDA_DEVICE_ORDER=FASTEST_FIRST` to always use the newer and faster GPU first (`nvidia-smi` or `rocm-smi` still reports the GPUs in their PCIe order). Or you could also set `export CUDA_VISIBLE_DEVICES=1,0`.
--- a/docs/source/en/how_to_hack_models.md
+++ b/docs/source/en/how_to_hack_models.md
@ -90,11 +90,6 @@ class SamVisionAttentionSplit(SamVisionAttention, nn.Module):

        attn_weights = (query * self.scale) @ key.transpose(-2, -1)

-        if self.use_rel_pos:
-            attn_weights = self.add_decomposed_rel_pos(
-                attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
-            )
-
        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
@ -114,13 +109,14 @@ Load the model with [`~PreTrainedModel.from_pretrained`].

 ```py
 from transformers import SamModel
-from transformers.models.sam import modeling_sam
-
-# replace the attention class in the modeling_sam module
-modeling_sam.SamVisionAttention = SamVisionAttentionSplit

 # load the pretrained SAM model
 model = SamModel.from_pretrained("facebook/sam-vit-base")
+
+# replace the attention class in the vision_encoder module
+for layer in model.vision_encoder.layers:
+    if hasattr(layer, "attn"):
+        layer.attn = SamVisionAttentionSplit(model.config.vision_config, model.config.vision_config.window_size)
 ```

 ## LoRA
@ -138,7 +134,7 @@ config = LoraConfig(
    # apply LoRA to q and v
    target_modules=["q", "v"],
    lora_dropout=0.1,
-    task_type="mask-generation"
+    task_type="FEATURE_EXTRACTION"
 )
 ```

@ -152,5 +148,5 @@ Call [print_trainable_parameters](https://huggingface.co/docs/peft/package_refer

 ```py
 model.print_trainable_parameters()
-"trainable params: 608,256 || all params: 94,343,728 || trainable%: 0.6447"
+"trainable params: 589,824 || all params: 94,274,096 || trainable%: 0.6256"
 ```
--- a/docs/source/en/hpo_train.md
+++ b/docs/source/en/hpo_train.md
@ -19,6 +19,9 @@ Hyperparameter search discovers an optimal set of hyperparameters that produces

 This guide will go over how to set up a hyperparameter search for each of the backends.

+> [!WARNING]
+> [SigOpt](https://github.com/sigopt/sigopt-server) is in public archive mode and is no longer actively maintained. Try using Optuna, Weights & Biases or Ray Tune instead.
+
 ```bash
 pip install optuna/sigopt/wandb/ray[tune]
 ```
--- a/docs/source/en/image_processors.md
+++ b/docs/source/en/image_processors.md
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

 # Image processors

-Image processors converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.
+Image processors convert images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.

 - [`~BaseImageProcessor.center_crop`] to resize an image
 - [`~BaseImageProcessor.normalize`] or [`~BaseImageProcessor.rescale`] pixel values
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@ -380,11 +380,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens

 [[autodoc]] HQQQuantizedCache

-[[autodoc]] SinkCache
-    - update
-    - get_seq_length
-    - reorder_cache
-
 [[autodoc]] OffloadedCache
    - update
    - prefetch_layer
@ -443,4 +438,3 @@ A [`Constraint`] can be used to force the generation to include specific tokens

 [[autodoc]] CompileConfig
    - __call__
-
--- a/docs/source/en/internal/import_utils.md
+++ b/docs/source/en/internal/import_utils.md
@ -84,6 +84,19 @@ class Trainer:

 Backends that can be added here are all the backends that are available in the `import_utils.py` module.

+Additionally, specific versions can be specified in each backend. For example, this is how you would specify
+a requirement on torch>=2.6 on the `Trainer` class:
+
+```python
+from .utils.import_utils import requires
+
+@requires(backends=("torch>=2.6", "accelerate"))
+class Trainer:
+    ...
+```
+
+You can specify the following operators: `==`, `>`, `>=`, `<`, `<=`, `!=`.
+
 ## Methods

 [[autodoc]] utils.import_utils.define_import_structure
--- a/docs/source/en/internal/model_debugging_utils.md
+++ b/docs/source/en/internal/model_debugging_utils.md
@ -16,7 +16,8 @@ rendered properly in your Markdown viewer.

 # Model debugging toolboxes

-This page lists all the debugging and model adding tools used by the library, as well as the utility functions it provides for it.
+This page lists all the debugging and model adding tools used by the library, as well as the utility functions it
+provides for it.

 Most of those are only useful if you are adding new models in the library.

@ -26,13 +27,14 @@ Most of those are only useful if you are adding new models in the library.

 ### Model addition debugger - context manager for model adders

-This context manager is a power user tool intended for model adders.
-It tracks all forward calls within a model forward and logs a slice of each input and output on a nested Json.
-To note, this context manager enforces `torch.no_grad()`.
+This context manager is a power user tool intended for model adders. It tracks all forward calls within a model forward
+and logs a slice of each input and output in a nested JSON file. Note that this context manager enforces `torch.no_grad()`.

 ### Rationale

-Because when porting models to transformers, even from python to python, model adders often have to do a lot of manual operations, involving saving and loading tensors, comparing dtypes, etc. This small tool can hopefully shave off some time.
+When porting models to transformers, even from python to python, model adders often have to do a lot of manual
+operations, involving saving and loading tensors, comparing dtypes, etc. This small tool can hopefully shave off some
+time.

 ### Usage

@ -62,10 +64,10 @@ inputs = processor(text=prompt, images=random_image, return_tensors="pt")

 # call forward method (not .generate!)
 with model_addition_debugger_context(
-  model,
-  debug_path="optional_path_to_your_directory",
-  do_prune_layers=False # This will output ALL the layers of a model.
-  ):
+    model,
+    debug_path="optional_path_to_your_directory",
+    do_prune_layers=False # This will output ALL the layers of a model.
+):
    output = model.forward(**inputs)

 ```
@ -73,8 +75,8 @@ with model_addition_debugger_context(

 ### Reading results

-The debugger generates two files from the forward call, both with the same base name, 
-but ending either with `_SUMMARY.json` or with `_FULL_TENSORS.json`. 
+The debugger generates two files from the forward call, both with the same base name, but ending either with
+`_SUMMARY.json` or with `_FULL_TENSORS.json`.

 The first one will contain a summary of each module's _input_ and _output_ tensor values and shapes.

@ -142,8 +144,8 @@ The first one will contain a summary of each module's _input_ and _output_ tenso
        { ... and so on
 ```

-The `_FULL_TENSORS.json` file will display a full view of all tensors, which is useful
-for comparing two files. 
+The `_FULL_TENSORS.json` file will display a full view of all tensors, which is useful for comparing two files.
+
 ```json
      "pixel_values": {
        "shape": "torch.Size([1, 5, 576, 588])",
@ -196,9 +198,38 @@ for comparing two files.
      },
 ```

+#### Saving tensors to disk
+
+Some model adders may benefit from logging full tensor values to disk to support, for example, numerical analysis
+across implementations.
+
+Set `use_repr=False` to write tensors to disk using [SafeTensors](https://huggingface.co/docs/safetensors/en/index).
+
+```python
+with model_addition_debugger_context(
+    model,
+    debug_path="optional_path_to_your_directory",
+    do_prune_layers=False,
+    use_repr=False,   # Defaults to True
+):
+    output = model.forward(**inputs)
+```
+
+When using `use_repr=False`, tensors are written to the same disk location as the `_SUMMARY.json` and
+`_FULL_TENSORS.json` files. The `value` property of entries in the `_FULL_TENSORS.json` file will contain a relative
+path reference to the associated `.safetensors` file. Each tensor is written to its own file as the `data` property of
+the state dictionary. File names are constructed using the `module_path` as a prefix with a few possible postfixes that
+are built recursively.
+
+*   Module inputs are postfixed with `_inputs` and module outputs with `_outputs`.
+*   `list` and `tuple` instances, such as `args` or function return values, will be postfixed with `_{index}`.
+*   `dict` instances will be postfixed with `_{key}`.
+
 ### Comparing between implementations

-Once the forward passes of two models have been traced by the debugger, one can compare the `json` output files. See below: we can see slight differences between these two implementations' key projection layer. Inputs are mostly identical, but not quite. Looking through the file differences makes it easier to pinpoint which layer is wrong. 
+Once the forward passes of two models have been traced by the debugger, one can compare the `json` output files. See
+below: we can see slight differences between these two implementations' key projection layer. Inputs are mostly
+identical, but not quite. Looking through the file differences makes it easier to pinpoint which layer is wrong.


 ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/files_difference_debugging.png)
@ -206,8 +237,13 @@ Once the forward passes of two models have been traced by the debugger, one can

 ### Limitations and scope

-This feature will only work for torch-based models, and would require more work and case-by-case approach for say `jax`-based models that are usually compiled. Models relying heavily on external kernel calls may work, but trace will probably miss some things. Regardless, any python implementation that aims at mimicking another implementation can be traced once instead of reran N times with breakpoints.
+This feature will only work for torch-based models, and would require more work and a case-by-case approach for, say,
+`jax`-based models that are usually compiled. Models relying heavily on external kernel calls may work, but the trace
+will probably miss some things. Regardless, any Python implementation that aims at mimicking another implementation
+can be traced once instead of being rerun N times with breakpoints.

-If you pass `do_prune_layers=False` to your model debugger, ALL the layers will be outputted to `json`. Else, only the first and last layer will be shown. This is useful when some layers (typically cross-attention) appear only after N layers. 
+If you pass `do_prune_layers=False` to your model debugger, ALL the layers will be outputted to `json`. Else, only the
+first and last layer will be shown. This is useful when some layers (typically cross-attention) appear only after N
+layers.

 [[autodoc]] model_addition_debugger_context
--- a/docs/source/en/internal/modeling_utils.md
+++ b/docs/source/en/internal/modeling_utils.md
@ -29,6 +29,11 @@ Most of those are only useful if you are studying the code of the models in the
 [[autodoc]] AttentionInterface
    - register

+## Attention Mask Functions
+
+[[autodoc]] AttentionMaskInterface
+    - register
+
 ## Rotary Position Embedding Functions

 [[autodoc]] dynamic_rope_update
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@ -30,7 +30,6 @@ Transformers offers several [`Cache`] classes that implement different caching m
 | Offloaded Static Cache  | No               | Yes                      | Yes                        | High    | Yes                     |
 | Quantized Cache        | Yes              | No                       | No                         | Low     | Yes                     |
 | Sliding Window Cache   | No               | Yes                      | Yes                        | High    | No                      |
-| Sink Cache             | Yes              | No                       | Yes                        | Mid     | Yes                     |

 This guide introduces you to the different [`Cache`] classes and shows you how to use them for generation.

@ -174,28 +173,6 @@ I like rock music because it's loud and energetic. It's a great way to express m
 </hfoption>
 </hfoptions>

-### Sink cache
-
-[`SinkCache`] is capable of generating very long sequences ("infinite length" according to the paper) by only retaining a few initial tokens from the sequence. These are called the *sink tokens* because they account for a significant portion of the attention scores during generation. Subsequent tokens are discarded on a sliding windowed basis, and only the latest `window_size` tokens are kept. This means most of the previous knowledge is discarded.
-
-The sink tokens allow a model to maintain stable performance even when it's dealing with very long text sequences.
-
-Enable [`SinkCache`] by initializing it first with the [window_length](https://hf.co/docs/transformers/main/en/internal/generation_utils#transformers.SinkCache.window_length) and [num_sink_tokens](https://hf.co/docs/transformers/main/en/internal/generation_utils#transformers.SinkCache.num_sink_tokens) parameters before passing it to [past_key_values](https://hf.co/docs/transformers/internal/generation_utils#transformers.generation.GenerateDecoderOnlyOutput.past_key_values) in [`~GenerationMixin.generate`].
-
-```py
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
-inputs = tokenizer("This is a long story about unicorns, fairies and magic.", return_tensors="pt").to(model.device)
-
-past_key_values = SinkCache(window_length=256, num_sink_tokens=4)
-out = model.generate(**inputs, do_sample=False, max_new_tokens=30, past_key_values=past_key_values)
-tokenizer.batch_decode(out, skip_special_tokens=True)[0]
-"This is a long story about unicorns, fairies and magic. It is a fantasy world where unicorns and fairies live together in harmony. The story follows a young girl named Lily"
-```
-
 ## Speed optimized caches

 The default [`DynamicCache`] prevents you from taking advantage of just-in-time (JIT) optimizations because the cache size isn't fixed. JIT optimizations enable you to minimize latency at the expense of memory usage. All of the following cache types are compatible with JIT optimizations like [torch.compile](./llm_optims#static-kv-cache-and-torchcompile) to accelerate generation.
@ -247,7 +224,7 @@ Enable [`SlidingWindowCache`] by configuring `cache_implementation="sliding_wind

 ```py
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
+from transformers import AutoTokenizer, AutoModelForCausalLM

 tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
 model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0")
@ -284,8 +261,6 @@ A cache can also work in iterative generation settings where there is back-and-f

 For iterative generation with a cache, start by initializing an empty cache class and then you can feed in your new prompts. Keep track of dialogue history with a [chat template](./chat_templating).

-If you're using [`SinkCache`], the inputs need to be truncated to the maximum length because [`SinkCache`] can generate text that exceeds its maximum window size. However, the first input shouldn't exceed the maximum cache length.
-
 The example below demonstrates how to use a cache for iterative generation.

 ```py
@ -293,7 +268,6 @@ import torch
 from transformers import AutoTokenizer,AutoModelForCausalLM
 from transformers.cache_utils import (
    DynamicCache,
-    SinkCache,
    StaticCache,
    SlidingWindowCache,
    QuantoQuantizedCache,
@ -313,8 +287,6 @@ messages = []
 for prompt in user_prompts:
    messages.append({"role": "user", "content": prompt})
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
-    if isinstance(past_key_values, SinkCache):
-        inputs = {k: v[:, -max_cache_length:] for k, v in inputs.items()}
    input_length = inputs["input_ids"].shape[1]
    outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256, past_key_values=past_key_values)
    completion = tokenizer.decode(outputs[0, input_length: ], skip_special_tokens=True)
@ -336,7 +308,7 @@ model_id = "meta-llama/Llama-2-7b-chat-hf"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
 tokenizer = AutoTokenizer.from_pretrained(model_id)

-# Init StaticCache with big enough max-length (1024 tokens for the below example) 
+# Init StaticCache with big enough max-length (1024 tokens for the below example)
 # You can also init a DynamicCache, if that suits you better
 prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)

@ -351,7 +323,7 @@ responses = []
 for prompt in prompts:
    new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
    past_key_values = copy.deepcopy(prompt_cache)
-    outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20) 
+    outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20)
    response = tokenizer.batch_decode(outputs)[0]
    responses.append(response)

--- a/docs/source/en/llm_tutorial.md
+++ b/docs/source/en/llm_tutorial.md
@ -20,9 +20,13 @@ rendered properly in your Markdown viewer.

 Text generation is the most popular application for large language models (LLMs). A LLM is trained to generate the next word (token) given some initial text (prompt) along with its own generated outputs up to a predefined length or when it reaches an end-of-sequence (`EOS`) token.

-In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities.
+In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities. This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.

-This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.
+> [!TIP]
+> You can also chat with a model directly from the command line. ([reference](./conversations.md#transformers-cli))
+> ```shell
+> transformers chat Qwen/Qwen2.5-0.5B-Instruct
+> ```

 ## Default generate

@ -80,14 +84,17 @@ GenerationConfig {
 }
 ```

-You can customize [`~GenerationMixin.generate`] by overriding the parameters and values in [`GenerationConfig`]. Some of the most commonly adjusted parameters are [max_new_tokens](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.max_new_tokens), [num_beams](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.num_beams), [do_sample](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.do_sample), and [num_return_sequences](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.num_return_sequences).
+You can customize [`~GenerationMixin.generate`] by overriding the parameters and values in [`GenerationConfig`]. See [this section below](#common-options) for commonly adjusted parameters.

 ```py
 # enable beam search sampling strategy
 model.generate(**inputs, num_beams=4, do_sample=True)
 ```

-[`~GenerationMixin.generate`] can also be extended with external libraries or custom code. The `logits_processor` parameter accepts custom [`LogitsProcessor`] instances for manipulating the next token probability distribution. `stopping_criteria` supports custom [`StoppingCriteria`] to stop text generation. Check out the [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo) for more examples of external [`~GenerationMixin.generate`]-compatible extensions.
+[`~GenerationMixin.generate`] can also be extended with external libraries or custom code:
+1. the `logits_processor` parameter accepts custom [`LogitsProcessor`] instances for manipulating the next token probability distribution;
+2. the `stopping_criteria` parameter supports custom [`StoppingCriteria`] to stop text generation;
+3. other custom generation methods can be loaded through the `custom_generate` flag ([docs](./generation_strategies#custom-decoding-methods)).

 Refer to the [Generation strategies](./generation_strategies) guide to learn more about search, sampling, and decoding strategies.

@ -134,6 +141,20 @@ outputs = model.generate(**inputs, generation_config=generation_config)
 print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 ```

+## Common Options
+
+[`~GenerationMixin.generate`] is a powerful tool that can be heavily customized. This can be daunting for new users. This section contains a list of popular generation options that you can define in most text generation tools in Transformers: [`~GenerationMixin.generate`], [`GenerationConfig`], `pipelines`, the `chat` CLI, ...
+
+| Option name | Type | Simplified description |
+|---|---|---|
+| `max_new_tokens` | `int` | Controls the maximum generation length. Be sure to define it, as it usually defaults to a small value. |
+| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies.md) for more information. |
+| `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
+| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
+| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
+| `eos_token_id` | `List[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
+
+
 ## Pitfalls

 The section below covers some common issues you may encounter during text generation and how to solve them.
@ -286,4 +307,4 @@ Take a look below for some more specific and specialized text generation librari
 - [SynCode](https://github.com/uiuc-focal-lab/syncode): a library for context-free grammar guided generation (JSON, SQL, Python).
 - [Text Generation Inference](https://github.com/huggingface/text-generation-inference): a production-ready server for LLMs.
 - [Text generation web UI](https://github.com/oobabooga/text-generation-webui): a Gradio web UI for text generation.
- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.
+- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.
--- a/docs/source/en/main_classes/video_processor.md
+++ b/docs/source/en/main_classes/video_processor.md
@ -0,0 +1,55 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+# Video Processor
+
+A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch tensors.
+
+The video processor extends the functionality of image processors by allowing Vision Large Language Models (VLMs) to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM.
+
+When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video related arguments in a dedicated file named `video_preprocessor_config.json`. Don't worry if you haven't updated your VLM, the processor will try to load video related configurations from a file named `preprocessor_config.json`.
+
+
+### Usage Example
+Here's an example of how to load a video processor with the [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
+
+```python
+from transformers import AutoVideoProcessor
+
+processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
+```
+
+Currently, when a base image processor is used for videos, it processes video data by treating each frame as an individual image and applying transformations frame-by-frame. While functional, this approach is not highly efficient. Using `AutoVideoProcessor` allows us to take advantage of **fast video processors**, leveraging the [torchvision](https://pytorch.org/vision/stable/index.html) library. Fast processors handle the whole batch of videos at once, without iterating over each video or frame. These updates introduce GPU acceleration and significantly enhance processing speed, especially for tasks requiring high throughput.
+
+Fast video processors are available for all models and are loaded by default when an `AutoVideoProcessor` is initialized. When using a fast video processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. For even more speed improvement, we can compile the processor when using 'cuda' as device.
+
+```python
+import torch
+from transformers.video_utils import load_video
+from transformers import AutoVideoProcessor
+
+video = load_video("video.mp4")
+processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
+processor = torch.compile(processor)
+processed_video = processor(video, return_tensors="pt")
+```
+
+
+## BaseVideoProcessor
+
+[[autodoc]] video_processing_utils.BaseVideoProcessor
+
--- a/docs/source/en/model_doc/albert.md
+++ b/docs/source/en/model_doc/albert.md
@ -57,6 +57,7 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). This
 - Embedding size E is different from hidden size H justified because the embeddings are context independent (one embedding vector represents one token), whereas hidden states are context dependent (one hidden state represents a sequence of tokens) so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V being the vocab size). If E < H, it has less parameters.
 - Layers are split in groups that share parameters (to save memory).
 Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have been swapped or not.
+- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`.

 ### Using Scaled Dot Product Attention (SDPA)

--- a/docs/source/en/model_doc/align.md
+++ b/docs/source/en/model_doc/align.md
@ -13,65 +13,141 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
+<div style="float: right;">
+  <div class="flex flex-wrap space-x-1">
+    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    <img alt="Transformers" src="https://img.shields.io/badge/Transformers-6B5B95?style=flat&logo=transformers&logoColor=white">
+  </div>
+</div>

 # ALIGN

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
+[ALIGN](https://huggingface.co/papers/2102.05918) is pretrained on a noisy 1.8 billion alt‑text and image pair dataset to show that scale can make up for the noise. It uses a dual‑encoder architecture, [EfficientNet](./efficientnet) for images and [BERT](./bert) for text, and a contrastive loss to align similar image–text embeddings together while pushing different embeddings apart. Once trained, ALIGN can encode any image and candidate captions into a shared vector space for zero‑shot retrieval or classification without requiring extra labels. This scale‑first approach reduces dataset curation costs and powers state‑of‑the‑art image–text retrieval and zero‑shot ImageNet classification.

-## Overview
+You can find all the original ALIGN checkpoints under the [Kakao Brain](https://huggingface.co/kakaobrain?search_models=align) organization.

-The ALIGN model was proposed in [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. ALIGN is a multi-modal vision and language model. It can be used for image-text similarity and for zero-shot image classification. ALIGN features a dual-encoder architecture with [EfficientNet](efficientnet) as its vision encoder and [BERT](bert) as its text encoder, and learns to align visual and text representations with contrastive learning. Unlike previous work, ALIGN leverages a massive noisy dataset and shows that the scale of the corpus can be used to achieve SOTA representations with a simple recipe.
+> [!TIP]
+> Click on the ALIGN models in the right sidebar for more examples of how to apply ALIGN to different vision and text related tasks.

-The abstract from the paper is the following:
+The example below demonstrates zero-shot image classification with [`Pipeline`] or the [`AutoModel`] class.

-*Pre-trained representations are becoming crucial for many NLP and perception tasks. While representation learning in NLP has transitioned to training on raw text without human annotations, visual and vision-language representations still rely heavily on curated training datasets that are expensive or require expert knowledge. For vision applications, representations are mostly learned using datasets with explicit class labels such as ImageNet or OpenImages. For vision-language, popular datasets like Conceptual Captions, MSCOCO, or CLIP all involve a non-trivial data collection (and cleaning) process. This costly curation process limits the size of datasets and hence hinders the scaling of trained models. In this paper, we leverage a noisy dataset of over one billion image alt-text pairs, obtained without expensive filtering or post-processing steps in the Conceptual Captions dataset. A simple dual-encoder architecture learns to align visual and language representations of the image and text pairs using a contrastive loss. We show that the scale of our corpus can make up for its noise and leads to state-of-the-art representations even with such a simple learning scheme. Our visual representation achieves strong performance when transferred to classification tasks such as ImageNet and VTAB. The aligned visual and language representations enables zero-shot image classification and also set new state-of-the-art results on Flickr30K and MSCOCO image-text retrieval benchmarks, even when compared with more sophisticated cross-attention models. The representations also enable cross-modality search with complex text and text + image queries.*
+<hfoptions id="usage">  

-This model was contributed by [Alara Dirik](https://huggingface.co/adirik).
-The original code is not released, this implementation is based on the Kakao Brain implementation based on the original paper.
+<hfoption id="Pipeline">

-## Usage example
-
-ALIGN uses EfficientNet to get visual features and BERT to get the text features. Both the text and visual features are then projected to a latent space with identical dimension. The dot product between the projected image and text features is then used as a similarity score.
-
-[`AlignProcessor`] wraps [`EfficientNetImageProcessor`] and [`BertTokenizer`] into a single instance to both encode the text and preprocess the images. The following example shows how to get the image-text similarity scores using [`AlignProcessor`] and [`AlignModel`].
-
-```python
-import requests
+```py
 import torch
-from PIL import Image
-from transformers import AlignProcessor, AlignModel
+from transformers import pipeline

-processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
-model = AlignModel.from_pretrained("kakaobrain/align-base")
+pipeline = pipeline(
+    task="zero-shot-image-classification",
+    model="kakaobrain/align-base",
+    device=0,
+    torch_dtype=torch.bfloat16
+)

-url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = Image.open(requests.get(url, stream=True).raw)
-candidate_labels = ["an image of a cat", "an image of a dog"]
+candidate_labels = [
+    "a photo of a dog",
+    "a photo of a cat",
+    "a photo of a person"
+]

-inputs = processor(images=image ,text=candidate_labels, return_tensors="pt")
-
-with torch.no_grad():
-    outputs = model(**inputs)
-
-# this is the image-text similarity score
-logits_per_image = outputs.logits_per_image
-
-# we can take the softmax to get the label probabilities
-probs = logits_per_image.softmax(dim=1)
-print(probs)
+pipeline("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", candidate_labels=candidate_labels)
 ```

+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+import requests
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForZeroShotImageClassification
+
+processor = AutoProcessor.from_pretrained("kakaobrain/align-base")
+model = AutoModelForZeroShotImageClassification.from_pretrained("kakaobrain/align-base").to("cuda")
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+image = requests.get(url, stream=True)
+inputs = Image.open(image.raw).convert("RGB")
+
+image_inputs = processor(images=inputs, return_tensors="pt").to("cuda")
+with torch.no_grad():
+    image_embeds = model.get_image_features(**image_inputs)
+
+candidate_labels = ["a photo of a dog", "a photo of a cat", "a photo of a person"]
+text_inputs = processor(text=candidate_labels, padding=True, return_tensors="pt").to("cuda")
+with torch.no_grad():
+    text_embeds = model.get_text_features(**text_inputs)
+
+image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+text_embeds  = text_embeds  / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+logits = (image_embeds @ text_embeds.T) * 100.0
+probs  = logits.softmax(dim=-1).cpu().squeeze()
+
+for label, score in zip(candidate_labels, probs):
+    print(f"{label:20s} → {score.item():.4f}")
+```
+
+</hfoption>
+
+</hfoptions>
+
+## Notes
+
+- ALIGN projects the text and visual features into latent space and the dot product between the projected image and text features is used as the similarity score. The example below demonstrates how to calculate the image-text similarity score with [`AlignProcessor`] and [`AlignModel`].
+
+  ```py
+  # Example of using ALIGN for image-text similarity
+  from transformers import AlignProcessor, AlignModel
+  import torch
+  from PIL import Image
+  import requests
+  from io import BytesIO
+  
+  # Load processor and model
+  processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
+  model = AlignModel.from_pretrained("kakaobrain/align-base")
+  
+  # Download image from URL
+  url = "https://huggingface.co/roschmid/dog-races/resolve/main/images/Golden_Retriever.jpg"
+  response = requests.get(url)
+  image = Image.open(BytesIO(response.content))  # Convert the downloaded bytes to a PIL Image
+  
+  texts = ["a photo of a cat", "a photo of a dog"]
+  
+  # Process image and text inputs
+  inputs = processor(images=image, text=texts, return_tensors="pt")
+  
+  # Get the embeddings
+  with torch.no_grad():
+      outputs = model(**inputs)
+  
+  image_embeds = outputs.image_embeds
+  text_embeds = outputs.text_embeds
+  
+  # Normalize embeddings for cosine similarity
+  image_embeds = image_embeds / image_embeds.norm(dim=1, keepdim=True)
+  text_embeds = text_embeds / text_embeds.norm(dim=1, keepdim=True)
+  
+  # Calculate similarity scores
+  similarity_scores = torch.matmul(text_embeds, image_embeds.T)
+  
+  # Print raw scores
+  print("Similarity scores:", similarity_scores)
+  
+  # Convert to probabilities
+  probs = torch.nn.functional.softmax(similarity_scores, dim=0)
+  print("Probabilities:", probs)
+  
+  # Get the most similar text
+  most_similar_idx = similarity_scores.argmax().item()
+  print(f"Most similar text: '{texts[most_similar_idx]}'")
+  ```
+
 ## Resources
-
-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ALIGN.
-
- A blog post on [ALIGN and the COYO-700M dataset](https://huggingface.co/blog/vit-align).
- A zero-shot image classification [demo](https://huggingface.co/spaces/adirik/ALIGN-zero-shot-image-classification).
- [Model card](https://huggingface.co/kakaobrain/align-base) of `kakaobrain/align-base` model.
-
-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it. The resource should ideally demonstrate something new instead of duplicating an existing resource.
+- Refer to the [Kakao Brain’s Open Source ViT, ALIGN, and the New COYO Text-Image Dataset](https://huggingface.co/blog/vit-align) blog post for more details.

 ## AlignConfig

--- a/docs/source/en/model_doc/aria.md
+++ b/docs/source/en/model_doc/aria.md
@ -14,60 +14,71 @@ rendered properly in your Markdown viewer.

 -->

-# Aria
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# Aria

-The Aria model was proposed in [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://huggingface.co/papers/2410.05993) by Li et al. from the Rhymes.AI team.
+[Aria](https://huggingface.co/papers/2410.05993) is a multimodal mixture-of-experts (MoE) model. The goal of this model is to open-source a training recipe for creating a multimodal native model from scratch. Aria has 3.9B and 3.5B activated parameters per visual and text token respectively. Text is handled by a MoE decoder and visual inputs are handled by a lightweight visual encoder. It is trained in 4 stages: language pretraining, multimodal pretraining, multimodal long-context pretraining, and multimodal post-training.

-Aria is an open multimodal-native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. It has a Mixture-of-Experts architecture, with respectively 3.9B and 3.5B activated parameters per visual token and text token. 
+You can find all the original Aria checkpoints under the [Rhymes.AI](https://huggingface.co/rhymes-ai?search_models=aria) organization.

-The abstract from the paper is the following:
+> [!TIP]
+> Click on the Aria models in the right sidebar for more examples of how to apply Aria to different multimodal tasks.

-*Information comes in diverse modalities. Multimodal native AI models are essential to integrate real-world information and deliver comprehensive understanding. While proprietary multimodal native models exist, their lack of openness imposes obstacles for adoptions, let alone adaptations. To fill this gap, we introduce Aria, an open multimodal native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. Aria is a mixture-of-expert model with 3.9B and 3.5B activated parameters per visual token and text token, respectively. It outperforms Pixtral-12B and Llama3.2-11B, and is competitive against the best proprietary models on various multimodal tasks. We pre-train Aria from scratch following a 4-stage pipeline, which progressively equips the model with strong capabilities in language understanding, multimodal understanding, long context window, and instruction following. We open-source the model weights along with a codebase that facilitates easy adoptions and adaptations of Aria in real-world applications.*
+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.

-This model was contributed by [m-ric](https://huggingface.co/m-ric).
-The original code can be found [here](https://github.com/rhymes-ai/Aria).
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-## Usage tips
-
-Here's how to use the model for vision tasks:
 ```python
-import requests
 import torch
-from PIL import Image
+from transformers import pipeline

-from transformers import AriaProcessor, AriaForConditionalGeneration
+pipeline = pipeline(
+    "image-to-text",
+    model="rhymes-ai/Aria",
+    device=0,
+    torch_dtype=torch.bfloat16
+)
+pipeline(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+    text="What is shown in this image?"
+)
+```

-model_id_or_path = "rhymes-ai/Aria"
+</hfoption>
+<hfoption id="AutoModel">

-model = AriaForConditionalGeneration.from_pretrained(
-    model_id_or_path, device_map="auto"
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
+
+model = AutoModelForCausalLM.from_pretrained(
+    "rhymes-ai/Aria",
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="sdpa"
 )

-processor = AriaProcessor.from_pretrained(model_id_or_path)
-
-image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")

 messages = [
    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"text": "what is the image?", "type": "text"},
-        ],
-    }
+        "role": "user", "content": [
+            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
+            {"type": "text", "text": "What is shown in this image?"},
+        ]
+    },
 ]

-text = processor.apply_chat_template(messages, add_generation_prompt=True)
-inputs = processor(text=text, images=image, return_tensors="pt")
-inputs.to(model.device)
+inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
+inputs = inputs.to(model.device, torch.bfloat16)

 output = model.generate(
    **inputs,
@ -79,6 +90,55 @@ output = model.generate(
 )
 output_ids = output[0][inputs["input_ids"].shape[1]:]
 response = processor.decode(output_ids, skip_special_tokens=True)
+print(response)
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+	
+The example below uses [torchao](../quantization/torchao) to quantize only the weights to int4 with the [rhymes-ai/Aria-sequential_mlp](https://huggingface.co/rhymes-ai/Aria-sequential_mlp) checkpoint. This checkpoint replaces grouped GEMM with `torch.nn.Linear` layers for easier quantization.
+
+```py
+# pip install torchao
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoProcessor
+
+quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
+model = AutoModelForCausalLM.from_pretrained(
+    "rhymes-ai/Aria-sequential_mlp",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+processor = AutoProcessor.from_pretrained(
+    "rhymes-ai/Aria-sequential_mlp",
+)
+
+messages = [
+    {
+        "role": "user", "content": [
+            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
+            {"type": "text", "text": "What is shown in this image?"},
+        ]
+    },
+]
+
+inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
+inputs = inputs.to(model.device, torch.bfloat16)
+
+output = model.generate(
+    **inputs,
+    max_new_tokens=15,
+    stop_strings=["<|im_end|>"],
+    tokenizer=processor.tokenizer,
+    do_sample=True,
+    temperature=0.9,
+)
+output_ids = output[0][inputs["input_ids"].shape[1]:]
+response = processor.decode(output_ids, skip_special_tokens=True)
+print(response)
 ```


@ -102,6 +162,10 @@ response = processor.decode(output_ids, skip_special_tokens=True)

 [[autodoc]] AriaTextModel

+## AriaModel
+
+[[autodoc]] AriaModel
+
 ## AriaTextForCausalLM

 [[autodoc]] AriaTextForCausalLM
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@ -74,6 +74,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its

 [[autodoc]] AutoImageProcessor

+## AutoVideoProcessor
+
+[[autodoc]] AutoVideoProcessor
+
 ## AutoProcessor

 [[autodoc]] AutoProcessor
@ -385,3 +389,9 @@ The following auto classes are available for the following multimodal tasks.
 ### AutoModelForImageTextToText

 [[autodoc]] AutoModelForImageTextToText
+
+## Time Series
+
+### AutoModelForTimeSeriesPrediction
+
+[[autodoc]] AutoModelForTimeSeriesPrediction
--- a/docs/source/en/model_doc/aya_vision.md
+++ b/docs/source/en/model_doc/aya_vision.md
@ -237,6 +237,10 @@ for i, output in enumerate(batch_outputs):

 [[autodoc]] AyaVisionConfig

+## AyaVisionModel
+
+[[autodoc]] AyaVisionModel
+
 ## AyaVisionForConditionalGeneration

 [[autodoc]] AyaVisionForConditionalGeneration
--- a/docs/source/en/model_doc/bamba.md
+++ b/docs/source/en/model_doc/bamba.md
@ -39,7 +39,7 @@ Checkout all Bamba-9B model checkpoints [here](https://github.com/foundation-mod
 <!---
 ## Usage Tips

-Tips: 
+Tips:

 - The architecture is based on Mamba-2 models.

@ -63,7 +63,35 @@ response = model.generate(**inputs, max_new_tokens=64)
 print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
 ```

+
+## Padding-Free Training
+
+Bamba supports padding-free training in which distinct training examples can be concatenated
+together while nevertheless processing the inputs as though they belonged to separate batches. When
+the examples are of varying lengths, padding-free training can provide significant speed ups and
+memory savings compared to batching the examples together and using padding, as the unnecessary
+compute and memory due to padding is avoided entirely. The performance gains depend on factors such
+as the model and the data distribution, but throughput gains up to [~2x are commonly
+seen](https://github.com/huggingface/transformers/pull/35861#issue-2807873129).
+
+Using padding-free training with Bamba requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d`
+packages, and the following arguments must be passed to the model in addition to `input_ids` and
+`labels`:
+* `position_ids: torch.LongTensor`: the position index of each token in each sequence.
+* `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
+* Each of the [`FlashAttentionKwargs`]
+    * `cu_seq_lens_q: torch.LongTensor`: The cumulative sequence lengths of all queries.
+    * `cu_seq_lens_k: torch.LongTensor`: The cumulative sequence lengths of all keys.
+    * `max_length_q: int`: the longest query length in the batch.
+    * `max_length_k: int`: the longest key length in the batch.
+
+The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] can be used
+to programmatically generate the above set of additional arguments using `return_seq_idx=True` and
+`return_flash_attn_kwargs=True`. See [this blog post](https://huggingface.co/blog/packing-with-FA2)
+for additional information.
+
+
 [[autodoc]] BambaForCausalLM
    - forward

-This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim). 
+This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
--- a/docs/source/en/model_doc/bart.md
+++ b/docs/source/en/model_doc/bart.md
@ -14,115 +14,87 @@ rendered properly in your Markdown viewer.

 -->

-# BART

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+    <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cR
WEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+    ">
+    <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+    <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# BART
+[BART](https://huggingface.co/papers/1910.13461) is a sequence-to-sequence model that combines the pretraining objectives from BERT and GPT. It’s pretrained by corrupting text in different ways like deleting words, shuffling sentences, or masking tokens and learning how to fix it. The encoder encodes the corrupted document and the corrupted text is fixed by the decoder. As it learns to recover the original text, BART gets really good at both understanding and generating language.

-The Bart model was proposed in [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation,
-Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan
-Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
+You can find all the original BART checkpoints under the [AI at Meta](https://huggingface.co/facebook?search_models=bart) organization.

-According to the abstract,
+The example below demonstrates how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.

- Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a
-  left-to-right decoder (like GPT).
- The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme,
-  where spans of text are replaced with a single mask token.
- BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It
-  matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new
-  state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains
-  of up to 6 ROUGE.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The authors' code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/bart).
+```py
+import torch
+from transformers import pipeline

-## Usage tips:
+pipeline = pipeline(
+    task="fill-mask",
+    model="facebook/bart-large",
+    torch_dtype=torch.float16,
+    device=0
+)
+pipeline("Plants create <mask> through a process known as photosynthesis.")

- BART is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
-  the left.
- Sequence-to-sequence model with an encoder and a decoder. Encoder is fed a corrupted version of the tokens, decoder is fed the original tokens (but has a mask to hide the future words like a regular transformers decoder). A composition of the following transformations are applied on the pretraining tasks for the encoder:
+```
+</hfoption>
+<hfoption id="AutoModel">

-  * mask random tokens (like in BERT)
-  * delete random tokens
-  * mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token)
-  * permute sentences
-  * rotate the document to make it start at a specific token
+```py
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer

-## Implementation Notes
+tokenizer = AutoTokenizer.from_pretrained(
+    "facebook/bart-large",
+)
+model = AutoModelForMaskedLM.from_pretrained(
+    "facebook/bart-large",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to("cuda")

- Bart doesn't use `token_type_ids` for sequence classification. Use [`BartTokenizer`] or
-  [`~BartTokenizer.encode`] to get the proper splitting.
- The forward pass of [`BartModel`] will create the `decoder_input_ids` if they are not passed.
-  This is different than some other modeling APIs. A typical use case of this feature is mask filling.
- Model predictions are intended to be identical to the original implementation when
-  `forced_bos_token_id=0`. This only works, however, if the string you pass to
-  [`fairseq.encode`] starts with a space.
- [`~generation.GenerationMixin.generate`] should be used for conditional generation tasks like
-  summarization, see the example in that docstrings.
- Models that load the *facebook/bart-large-cnn* weights will not have a `mask_token_id`, or be able to perform
-  mask-filling tasks.
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits

-## Mask Filling
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)

-The `facebook/bart-base` and `facebook/bart-large` checkpoints can be used to fill multi-token masks.
-
-```python
-from transformers import BartForConditionalGeneration, BartTokenizer
-
-model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
-tok = BartTokenizer.from_pretrained("facebook/bart-large")
-example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
-batch = tok(example_english_phrase, return_tensors="pt")
-generated_ids = model.generate(batch["input_ids"])
-assert tok.batch_decode(generated_ids, skip_special_tokens=True) == [
-    "UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria"
-]
+print(f"The predicted token is: {predicted_token}")
 ```

-## Resources
+</hfoption>
+<hfoption id="transformers CLI">

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BART. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+```bash
+echo -e "Plants create <mask> through a process known as photosynthesis." | transformers run --task fill-mask --model facebook/bart-large --device 0
+```

-<PipelineTag pipeline="summarization"/>
+</hfoption>
+</hfoptions>

- A blog post on [Distributed Training: Train BART/T5 for Summarization using 🤗 Transformers and Amazon SageMaker](https://huggingface.co/blog/sagemaker-distributed-training-seq2seq).
- A notebook on how to [finetune BART for summarization with fastai using blurr](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb). 🌎
- A notebook on how to [finetune BART for summarization in two languages with Trainer class](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb). 🌎
- [`BartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb).
- [`TFBartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb).
- [`FlaxBartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization).
- An example of how to train [`BartForConditionalGeneration`] with a Hugging Face `datasets` object can be found in this [forum discussion](https://discuss.huggingface.co/t/train-bart-for-conditional-generation-e-g-summarization/1904)
- [Summarization](https://huggingface.co/course/chapter7/5?fw=pt#summarization) chapter of the 🤗 Hugging Face course.
- [Summarization task guide](../tasks/summarization)
+## Notes

-<PipelineTag pipeline="fill-mask"/>
-
- [`BartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
- [`TFBartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
- [`FlaxBartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb).
- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course.
- [Masked language modeling task guide](../tasks/masked_language_modeling)
-
-<PipelineTag pipeline="translation"/>
-
- A notebook on how to [finetune mBART using Seq2SeqTrainer for Hindi to English translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb). 🌎
- [`BartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb).
- [`TFBartForConditionalGeneration`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/translation) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb).
- [Translation task guide](../tasks/translation)
-
-See also:
- [Text classification task guide](../tasks/sequence_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Distilled checkpoints](https://huggingface.co/models?search=distilbart) are described in this [paper](https://arxiv.org/abs/2010.13002).
+- Inputs should be padded on the right because BART uses absolute position embeddings.
+- The [facebook/bart-large-cnn](https://huggingface.co/facebook/bart-large-cnn) checkpoint doesn't include `mask_token_id` which means it can't perform mask-filling tasks.
+- BART doesn’t use `token_type_ids` for sequence classification. Use [`BartTokenizer`] or [`~PreTrainedTokenizerBase.encode`] to get the proper splitting.
+- The forward pass of [`BartModel`] creates the `decoder_input_ids` if they're not passed. This can be different from other model APIs, but it is a useful feature for mask-filling tasks.
+- Model predictions are intended to be identical to the original implementation when `forced_bos_token_id=0`. This only works if the text passed to `fairseq.encode` begins with a space.
+- [`~GenerationMixin.generate`] should be used for conditional generation tasks like summarization.

 ## BartConfig

--- a/docs/source/en/model_doc/beit.md
+++ b/docs/source/en/model_doc/beit.md
@ -151,6 +151,12 @@ If you're interested in submitting a resource to be included here, please feel f
    - preprocess
    - post_process_semantic_segmentation

+## BeitImageProcessorFast
+
+[[autodoc]] BeitImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+
 <frameworkcontent>
 <pt>

--- a/docs/source/en/model_doc/bert.md
+++ b/docs/source/en/model_doc/bert.md
@ -81,10 +81,10 @@ print(f"The predicted token is: {predicted_token}")
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
-echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google-bert/bert-base-uncased --device 0
+echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model google-bert/bert-base-uncased --device 0
 ```

 </hfoption>
@ -256,4 +256,4 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran

 [[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput

-[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
+[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
--- a/docs/source/en/model_doc/bertweet.md
+++ b/docs/source/en/model_doc/bertweet.md
@ -16,60 +16,82 @@ rendered properly in your Markdown viewer.

 # BERTweet

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+    <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cR
WEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+    ">
+    </div>
 </div>

-## Overview
+## BERTweet

-The BERTweet model was proposed in [BERTweet: A pre-trained language model for English Tweets](https://www.aclweb.org/anthology/2020.emnlp-demos.2.pdf) by Dat Quoc Nguyen, Thanh Vu, Anh Tuan Nguyen.
+[BERTweet](https://huggingface.co/papers/2005.10200) shares the same architecture as [BERT-base](./bert), but it’s pretrained like [RoBERTa](./roberta) on English Tweets. It performs really well on Tweet-related tasks like part-of-speech tagging, named entity recognition, and text classification.

-The abstract from the paper is the following:

-*We present BERTweet, the first public large-scale pre-trained language model for English Tweets. Our BERTweet, having
-the same architecture as BERT-base (Devlin et al., 2019), is trained using the RoBERTa pre-training procedure (Liu et
-al., 2019). Experiments show that BERTweet outperforms strong baselines RoBERTa-base and XLM-R-base (Conneau et al.,
-2020), producing better performance results than the previous state-of-the-art models on three Tweet NLP tasks:
-Part-of-speech tagging, Named-entity recognition and text classification.*
+You can find all the original BERTweet checkpoints under the [VinAI Research](https://huggingface.co/vinai?search_models=BERTweet) organization.

-This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The original code can be found [here](https://github.com/VinAIResearch/BERTweet).
+> [!TIP]
+> Refer to the [BERT](./bert) docs for more examples of how to apply BERTweet to different language tasks.

-## Usage example
+The example below demonstrates how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.

-```python
->>> import torch
->>> from transformers import AutoModel, AutoTokenizer
+<hfoptions id="usage">
+<hfoption id="Pipeline">

->>> bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
+```py
+import torch
+from transformers import pipeline

->>> # For transformers v4.x+:
->>> tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
+pipeline = pipeline(
+    task="fill-mask",
+    model="vinai/bertweet-base",
+    torch_dtype=torch.float16,
+    device=0
+)
+pipeline("Plants create <mask> through a process known as photosynthesis.")
+```
+</hfoption>
+<hfoption id="AutoModel">

->>> # For transformers v3.x:
->>> # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
+```py
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer

->>> # INPUT TWEET IS ALREADY NORMALIZED!
->>> line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"
+tokenizer = AutoTokenizer.from_pretrained(
+   "vinai/bertweet-base",
+)
+model = AutoModelForMaskedLM.from_pretrained(
+    "vinai/bertweet-base",
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to("cuda")

->>> input_ids = torch.tensor([tokenizer.encode(line)])
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits

->>> with torch.no_grad():
-...     features = bertweet(input_ids)  # Models outputs are now tuples
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)

->>> # With TensorFlow 2.0+:
->>> # from transformers import TFAutoModel
->>> # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
+print(f"The predicted token is: {predicted_token}")
 ```

-<Tip> 
+</hfoption>
+<hfoption id="transformers CLI">

-This implementation is the same as BERT, except for tokenization method. Refer to [BERT documentation](bert) for 
-API reference information.  
+```bash
+echo -e "Plants create <mask> through a process known as photosynthesis." | transformers run --task fill-mask --model vinai/bertweet-base --device 0
+```

-</Tip>
+</hfoption>
+</hfoptions>
+
+## Notes
+- Use the [`AutoTokenizer`] or [`BertweetTokenizer`] because it’s preloaded with a custom vocabulary adapted to tweet-specific tokens like hashtags (#), mentions (@), emojis, and common abbreviations. Make sure to also install the [emoji](https://pypi.org/project/emoji/) library.
+- Inputs should be padded on the right (`padding="max_length"`) because BERTweet, like BERT, uses absolute position embeddings.

 ## BertweetTokenizer

--- a/docs/source/en/model_doc/big_bird.md
+++ b/docs/source/en/model_doc/big_bird.md
@ -14,63 +14,87 @@ rendered properly in your Markdown viewer.

 -->

-# BigBird
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+        <img alt= "Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZ
CD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
 ">
+    </div>
 </div>

-## Overview
+# BigBird

-The BigBird model was proposed in [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by
-Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon,
-Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a sparse-attention
-based transformer which extends Transformer based models, such as BERT to much longer sequences. In addition to sparse
-attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it
-has been shown that applying sparse, global, and random attention approximates full attention, while being
-computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context,
-BigBird has shown improved performance on various long document NLP tasks, such as question answering and
-summarization, compared to BERT or RoBERTa.
+[BigBird](https://huggingface.co/papers/2007.14062) is a transformer model built to handle sequence lengths up to 4096 compared to 512 for [BERT](./bert). Traditional transformers struggle with long inputs because attention gets really expensive as the sequence length grows. BigBird fixes this by using a sparse attention mechanism, which means it doesn’t try to look at everything at once. Instead, it mixes in local attention, random attention, and a few global tokens to process the whole input. This combination gives it the best of both worlds. It keeps the computation efficient while still capturing enough of the sequence to understand it well. Because of this, BigBird is great at tasks involving long documents, like question answering, summarization, and genomic applications.

-The abstract from the paper is the following:

-*Transformers-based models, such as BERT, have been one of the most successful deep learning models for NLP.
-Unfortunately, one of their core limitations is the quadratic dependency (mainly in terms of memory) on the sequence
-length due to their full attention mechanism. To remedy this, we propose, BigBird, a sparse attention mechanism that
-reduces this quadratic dependency to linear. We show that BigBird is a universal approximator of sequence functions and
-is Turing complete, thereby preserving these properties of the quadratic, full attention model. Along the way, our
-theoretical analysis reveals some of the benefits of having O(1) global tokens (such as CLS), that attend to the entire
-sequence as part of the sparse attention mechanism. The proposed sparse attention can handle sequences of length up to
-8x of what was previously possible using similar hardware. As a consequence of the capability to handle longer context,
-BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also
-propose novel applications to genomics data.*
+You can find all the original BigBird checkpoints under the [Google](https://huggingface.co/google?search_models=bigbird) organization.

-This model was contributed by [vasudevgupta](https://huggingface.co/vasudevgupta). The original code can be found
-[here](https://github.com/google-research/bigbird).
+> [!TIP]
+> Click on the BigBird models in the right sidebar for more examples of how to apply BigBird to different language tasks.

-## Usage tips
+The example below demonstrates how to predict the `[MASK]` token with [`Pipeline`], [`AutoModel`], and from the command line.

- For an in-detail explanation on how BigBird's attention works, see [this blog post](https://huggingface.co/blog/big-bird).
- BigBird comes with 2 implementations: **original_full** & **block_sparse**. For the sequence length < 1024, using
-  **original_full** is advised as there is no benefit in using **block_sparse** attention.
- The code currently uses window size of 3 blocks and 2 global blocks.
- Sequence length must be divisible by block size.
- Current implementation supports only **ITC**.
- Current implementation doesn't support **num_random_blocks = 0**
- BigBird is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
-  the left.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

+```py 
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="fill-mask",
+    model="google/bigbird-roberta-base",
+    torch_dtype=torch.float16,
+    device=0
+)
+pipeline("Plants create [MASK] through a process known as photosynthesis.")
+```
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/bigbird-roberta-base",
+)
+model = AutoModelForMaskedLM.from_pretrained(
+    "google/bigbird-roberta-base",
+    torch_dtype=torch.float16,
+    device_map="auto",
+)
+inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits
+
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
+
+print(f"The predicted token is: {predicted_token}")
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google/bigbird-roberta-base --device 0
+```
+</hfoption>
+</hfoptions>
+
+## Notes
+- Inputs should be padded on the right because BigBird uses absolute position embeddings.
+- BigBird supports `original_full` and `block_sparse` attention. If the input sequence length is less than 1024, it is recommended to use `original_full` since sparse patterns don't offer much benefit for smaller inputs.
+- The current implementation uses a window size of 3 blocks and 2 global blocks, only supports the ITC implementation, and doesn't support `num_random_blocks=0`.
+- The sequence length must be divisible by the block size.

 ## Resources

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
+- Read the [BigBird](https://huggingface.co/blog/big-bird) blog post for more details about how its attention works.

 ## BigBirdConfig

--- a/docs/source/en/model_doc/biogpt.md
+++ b/docs/source/en/model_doc/biogpt.md
@ -14,77 +14,121 @@ rendered properly in your Markdown viewer.

 -->

-# BioGPT
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+            <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+            <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+            <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# BioGPT

-The BioGPT model was proposed in [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. BioGPT is a domain-specific generative pre-trained Transformer language model for biomedical text generation and mining. BioGPT follows the Transformer language model backbone, and is pre-trained on 15M PubMed abstracts from scratch.
+[BioGPT](https://huggingface.co/papers/2210.10341) is a generative Transformer model based on [GPT-2](./gpt2) and pretrained on 15 million PubMed abstracts. It is designed for biomedical language tasks.

-The abstract from the paper is the following:
+You can find all the original BioGPT checkpoints under the [Microsoft](https://huggingface.co/microsoft?search_models=biogpt) organization.

-*Pre-trained language models have attracted increasing attention in the biomedical domain, inspired by their great success in the general natural language domain. Among the two main branches of pre-trained language models in the general language domain, i.e. BERT (and its variants) and GPT (and its variants), the first one has been extensively studied in the biomedical domain, such as BioBERT and PubMedBERT. While they have achieved great success on a variety of discriminative downstream biomedical tasks, the lack of generation ability constrains their application scope. In this paper, we propose BioGPT, a domain-specific generative Transformer language model pre-trained on large-scale biomedical literature. We evaluate BioGPT on six biomedical natural language processing tasks and demonstrate that our model outperforms previous models on most tasks. Especially, we get 44.98%, 38.42% and 40.76% F1 score on BC5CDR, KD-DTI and DDI end-to-end relation extraction tasks, respectively, and 78.2% accuracy on PubMedQA, creating a new record. Our case study on text generation further demonstrates the advantage of BioGPT on biomedical literature to generate fluent descriptions for biomedical terms.*
+> [!TIP]
+> Click on the BioGPT models in the right sidebar for more examples of how to apply BioGPT to different language tasks.

-This model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/BioGPT).
+The example below demonstrates how to generate biomedical text with [`Pipeline`], [`AutoModel`], and also from the command line.

-## Usage tips
+<hfoptions id="usage">
+<hfoption id="Pipeline">

- BioGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left.
- BioGPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next token in a sequence. Leveraging this feature allows BioGPT to generate syntactically coherent text as it can be observed in the run_generation.py example script.
- The model can take the `past_key_values` (for PyTorch) as input, which is the previously computed key/value attention pairs. Using this (past_key_values or past) value prevents the model from re-computing pre-computed values in the context of text generation. For PyTorch, see past_key_values argument of the BioGptForCausalLM.forward() method for more information on its usage.
+```py
+import torch
+from transformers import pipeline

-### Using Scaled Dot Product Attention (SDPA)
-
-PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
-encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
-[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
-or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
-page for more information.
-
-SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
-`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
-
-```
-from transformers import BioGptForCausalLM
-model = BioGptForCausalLM.from_pretrained("microsoft/biogpt", attn_implementation="sdpa", torch_dtype=torch.float16)
+generator = pipeline(
+    task="text-generation",
+    model="microsoft/biogpt",
+    torch_dtype=torch.float16,
+    device=0,
+)
+result = generator("Ibuprofen is best used for", truncation=True, max_length=50, do_sample=True)[0]["generated_text"]
+print(result)
 ```

-On a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.3.1, OS Ubuntu 20.04) with `float16` and `microsoft/biogpt` model with a CausalLM head,
-we saw the following speedups during training.
+</hfoption>
+<hfoption id="AutoModel">

-For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer

-| num_training_steps | batch_size | seq_len | is cuda | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | sdpa peak mem (MB) | Mem saving (%) |
-|--------------------|------------|---------|---------|----------------------------|---------------------------|-------------|---------------------|--------------------|----------------|
-| 100                | 1          | 128     | False   | 0.038                      | 0.031                     | 21.301      | 1601.862            | 1601.497           | 0.023          |
-| 100                | 1          | 256     | False   | 0.039                      | 0.034                     | 15.084      | 1624.944            | 1625.296           | -0.022         |
-| 100                | 2          | 128     | False   | 0.039                      | 0.033                     | 16.820      | 1624.567            | 1625.296           | -0.045         |
-| 100                | 2          | 256     | False   | 0.065                      | 0.059                     | 10.255      | 1672.164            | 1672.164           | 0.000          |
-| 100                | 4          | 128     | False   | 0.062                      | 0.058                     | 6.998       | 1671.435            | 1672.164           | -0.044         |
-| 100                | 4          | 256     | False   | 0.113                      | 0.100                     | 13.316      | 2350.179            | 1848.435           | 27.144         |
-| 100                | 8          | 128     | False   | 0.107                      | 0.098                     | 9.883       | 2098.521            | 1848.435           | 13.530         |
-| 100                | 8          | 256     | False   | 0.222                      | 0.196                     | 13.413      | 3989.980            | 2986.492           | 33.601         |
+tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
+model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/biogpt",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)

-On a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.3.1, OS Ubuntu 20.04) with `float16` and `microsoft/biogpt` model with a simple AutoModel head,
-we saw the following speedups during inference.
+input_text = "Ibuprofen is best used for"
+inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

-| num_batches | batch_size | seq_len | is cuda | is half | use mask | Per token latency eager (ms) | Per token latency SDPA (ms) | Speedup (%) | Mem eager (MB) | Mem BT (MB) | Mem saved (%) |
-|-------------|------------|---------|---------|---------|----------|------------------------------|-----------------------------|-------------|----------------|--------------|---------------|
-| 50          | 1          | 64      | True    | True    | True     | 0.115                        | 0.098                       | 17.392      | 716.998        | 716.998      | 0.000         |
-| 50          | 1          | 128     | True    | True    | True     | 0.115                        | 0.093                       | 24.640      | 730.916        | 730.916      | 0.000         |
-| 50          | 2          | 64      | True    | True    | True     | 0.114                        | 0.096                       | 19.204      | 730.900        | 730.900      | 0.000         |
-| 50          | 2          | 128     | True    | True    | True     | 0.117                        | 0.095                       | 23.529      | 759.262        | 759.262      | 0.000         |
-| 50          | 4          | 64      | True    | True    | True     | 0.113                        | 0.096                       | 18.325      | 759.229        | 759.229      | 0.000         |
-| 50          | 4          | 128     | True    | True    | True     | 0.186                        | 0.178                       | 4.289       | 816.478        | 816.478      | 0.000         |
+with torch.no_grad():
+    generated_ids = model.generate(**inputs, max_length=50)
+    
+output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+print(output)
+```

+</hfoption>
+<hfoption id="transformers CLI">

-## Resources
+```bash
+echo -e "Ibuprofen is best used for" | transformers-cli run --task text-generation --model microsoft/biogpt --device 0
+```

- [Causal language modeling task guide](../tasks/language_modeling)
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bit precision.
+
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True
+)
+
+tokenizer = AutoTokenizer.from_pretrained("microsoft/BioGPT-Large")
+model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/BioGPT-Large", 
+    quantization_config=bnb_config,
+    torch_dtype=torch.bfloat16,
+    device_map="auto"
+)
+
+input_text = "Ibuprofen is best used for"
+inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
+with torch.no_grad():
+    generated_ids = model.generate(**inputs, max_length=50)    
+output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+print(output)
+```
+
+## Notes
+
+- Pad inputs on the right because BioGPT uses absolute position embeddings.
+- BioGPT can reuse previously computed key-value attention pairs. Access this feature with the [past_key_values](https://huggingface.co/docs/transformers/main/en/model_doc/biogpt#transformers.BioGptModel.forward.past_key_values) parameter in [`BioGptModel.forward`].
+- The `head_mask` argument is ignored when using an attention implementation other than "eager". If you want to use `head_mask`, make sure `attn_implementation="eager"`.
+
+   ```py
+   from transformers import AutoModelForCausalLM
+   
+   model = AutoModelForCausalLM.from_pretrained(
+      "microsoft/biogpt",
+      attn_implementation="eager"
+   )
+   ```

 ## BioGptConfig

@ -108,7 +152,7 @@ we saw the following speedups during inference.
 [[autodoc]] BioGptForCausalLM
    - forward

-    
+
 ## BioGptForTokenClassification

 [[autodoc]] BioGptForTokenClassification
--- a/docs/source/en/model_doc/blenderbot-small.md
+++ b/docs/source/en/model_doc/blenderbot-small.md
@ -21,6 +21,8 @@ rendered properly in your Markdown viewer.
 <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
 <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
 ">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>

 Note that [`BlenderbotSmallModel`] and
@ -52,7 +54,7 @@ found [here](https://github.com/facebookresearch/ParlAI).

 ## Usage tips

-Blenderbot Small is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than 
+Blenderbot Small is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
 the left.


--- a/docs/source/en/model_doc/blenderbot.md
+++ b/docs/source/en/model_doc/blenderbot.md
@ -21,6 +21,8 @@ rendered properly in your Markdown viewer.
 <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
 <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
 ">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>

 ## Overview
@ -45,7 +47,7 @@ This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The

 ## Usage tips and example

-Blenderbot is a model with absolute position embeddings so it's usually advised to pad the inputs on the right 
+Blenderbot is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
 rather than the left.

 An example:
@ -71,7 +73,7 @@ An example:
  `facebook/blenderbot_small_90M`, have a different architecture and consequently should be used with
  [BlenderbotSmall](blenderbot-small).

-  
+
 ## Resources

 - [Causal language modeling task guide](../tasks/language_modeling)
--- a/docs/source/en/model_doc/code_llama.md
+++ b/docs/source/en/model_doc/code_llama.md
@ -35,7 +35,7 @@ The example below demonstrates how to generate code with [`Pipeline`], or the [`

 <hfoptions id="usage">
 <hfoption id="Pipeline">
-    
+
 ```py
 import torch
 from transformers import pipeline
@ -76,7 +76,7 @@ prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
 input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

 output = model.generate(
-    **input_ids, 
+    **input_ids,
    max_new_tokens=256,
    cache_implementation="static"
 )
@ -92,10 +92,10 @@ print(filled_text)
 ```

 </hfoption>
-<hfoption id="transformers-cli">
-    
+<hfoption id="transformers CLI">
+
 ```bash
-echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers-cli run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
+echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
 ```

 </hfoption>
@ -146,7 +146,7 @@ visualizer("""def func(a, b):
 - Use the `<FILL_ME>` token where you want your input to be filled. The tokenizer splits this token to create a formatted input string that follows the [original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself.
    ```py
    from transformers import LlamaForCausalLM, CodeLlamaTokenizer
-    
+
    tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
    model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
    PROMPT = '''def remove_non_ascii(s: str) -> str:
@ -155,7 +155,7 @@ visualizer("""def func(a, b):
    '''
    input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
    generated_ids = model.generate(input_ids, max_new_tokens=128)
-    
+
    filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
    print(PROMPT.replace("<FILL_ME>", filling))
    ```
--- a/docs/source/en/model_doc/cohere.md
+++ b/docs/source/en/model_doc/cohere.md
@ -49,9 +49,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
 messages = [{"role": "user", "content": "How do plants make energy?"}]
 input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 output = model.generate(
-    input_ids, 
-    max_new_tokens=100, 
-    do_sample=True, 
+    input_ids,
+    max_new_tokens=100,
+    do_sample=True,
    temperature=0.3,
    cache_implementation="static",
 )
@ -59,11 +59,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
 # pip install -U flash-attn --no-build-isolation
-transformers-cli chat --model_name_or_path CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
+transformers chat CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
 ```

 </hfoption>
@ -85,9 +85,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
 messages = [{"role": "user", "content": "How do plants make energy?"}]
 input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 output = model.generate(
-    input_ids, 
-    max_new_tokens=100, 
-    do_sample=True, 
+    input_ids,
+    max_new_tokens=100,
+    do_sample=True,
    temperature=0.3,
    cache_implementation="static",
 )
--- a/docs/source/en/model_doc/colpali.md
+++ b/docs/source/en/model_doc/colpali.md
@ -20,9 +20,11 @@ rendered properly in your Markdown viewer.

 # ColPali

-[ColPali](https://huggingface.co/papers/2407.01449) is a model designed to retrieve documents by analyzing their visual features. Unlike traditional systems that rely heavily on text extraction and OCR, ColPali treats each page as an image. It uses [Paligemma-3B](./paligemma) to capture not only text, but also the layout, tables, charts, and other visual elements to create detailed embeddings. This offers a more comprehensive understanding of documents and enables more efficient and accurate retrieval.
+[ColPali](https://huggingface.co/papers/2407.01449) is a model designed to retrieve documents by analyzing their visual features. Unlike traditional systems that rely heavily on text extraction and OCR, ColPali treats each page as an image. It uses [Paligemma-3B](./paligemma) to capture not only text, but also the layout, tables, charts, and other visual elements to create detailed multi-vector embeddings that can be used for retrieval by computing pairwise late interaction similarity scores. This offers a more comprehensive understanding of documents and enables more efficient and accurate retrieval.

-You can find all the original ColPali checkpoints under the [ColPali](https://huggingface.co/collections/vidore/hf-native-colvision-models-6755d68fc60a8553acaa96f7) collection.
+This model was contributed by [@tonywu71](https://huggingface.co/tonywu71) (ILLUIN Technology) and [@yonigozlan](https://huggingface.co/yonigozlan) (HuggingFace).
+
+You can find all the original ColPali checkpoints under Vidore's [Hf-native ColVision Models](https://huggingface.co/collections/vidore/hf-native-colvision-models-6755d68fc60a8553acaa96f7) collection.

 > [!TIP]
 > Click on the ColPali models in the right sidebar for more examples of how to use ColPali for image retrieval.
@ -30,21 +32,25 @@ You can find all the original ColPali checkpoints under the [ColPali](https://hu
 <hfoptions id="usage">
 <hfoption id="image retrieval">

-```py
+```python
 import requests
 import torch
 from PIL import Image
+
 from transformers import ColPaliForRetrieval, ColPaliProcessor

-# Load model (bfloat16 support is limited; fallback to float32 if needed)
-model = ColPaliForRetrieval.from_pretrained(
-    "vidore/colpali-v1.2-hf",
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",  # "cpu", "cuda", or "mps" for Apple Silicon
-).eval()

+# Load the model and the processor
+model_name = "vidore/colpali-v1.3-hf"
+
+model = ColPaliForRetrieval.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",  # "cpu", "cuda", or "mps" for Apple Silicon
+)
 processor = ColPaliProcessor.from_pretrained(model_name)

+# The document page screenshots from your corpus
 url1 = "https://upload.wikimedia.org/wikipedia/commons/8/89/US-original-Declaration-1776.jpg"
 url2 = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Romeoandjuliet1597.jpg/500px-Romeoandjuliet1597.jpg"

@ -53,25 +59,37 @@ images = [
    Image.open(requests.get(url2, stream=True).raw),
 ]

+# The queries you want to retrieve documents for
 queries = [
-    "Who printed the edition of Romeo and Juliet?",
    "When was the United States Declaration of Independence proclaimed?",
+    "Who printed the edition of Romeo and Juliet?",
 ]

 # Process the inputs
-inputs_images = processor(images=images, return_tensors="pt").to(model.device)
-inputs_text = processor(text=queries, return_tensors="pt").to(model.device)
+inputs_images = processor(images=images).to(model.device)
+inputs_text = processor(text=queries).to(model.device)

 # Forward pass
 with torch.no_grad():
    image_embeddings = model(**inputs_images).embeddings
    query_embeddings = model(**inputs_text).embeddings

+# Score the queries against the images
 scores = processor.score_retrieval(query_embeddings, image_embeddings)

 print("Retrieval scores (query x image):")
 print(scores)
 ```
+
+If you have issues loading the images with PIL, you can use the following code to create dummy images:
+
+```python
+images = [
+    Image.new("RGB", (128, 128), color="white"),
+    Image.new("RGB", (64, 32), color="black"),
+]
+```
+
 </hfoption>
 </hfoptions>

@ -79,12 +97,15 @@ Quantization reduces the memory burden of large models by representing the weigh

 The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to int4.

-```py
+```python
 import requests
 import torch
 from PIL import Image
-from transformers import ColPaliForRetrieval, ColPaliProcessor
-from transformers import BitsAndBytesConfig
+
+from transformers import BitsAndBytesConfig, ColPaliForRetrieval, ColPaliProcessor
+
+
+model_name = "vidore/colpali-v1.3-hf"

 # 4-bit quantization configuration
 bnb_config = BitsAndBytesConfig(
@ -94,14 +115,11 @@ bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
 )

-model_name = "vidore/colpali-v1.2-hf"
-
-# Load model 
 model = ColPaliForRetrieval.from_pretrained(
    model_name,
    quantization_config=bnb_config,
-    device_map="cuda"
-).eval()
+    device_map="cuda",
+)

 processor = ColPaliProcessor.from_pretrained(model_name)

@ -114,8 +132,8 @@ images = [
 ]

 queries = [
-    "Who printed the edition of Romeo and Juliet?",
    "When was the United States Declaration of Independence proclaimed?",
+    "Who printed the edition of Romeo and Juliet?",
 ]

 # Process the inputs
@ -127,6 +145,7 @@ with torch.no_grad():
    image_embeddings = model(**inputs_images).embeddings
    query_embeddings = model(**inputs_text).embeddings

+# Score the queries against the images
 scores = processor.score_retrieval(query_embeddings, image_embeddings)

 print("Retrieval scores (query x image):")
--- a/docs/source/en/model_doc/colqwen2.md
+++ b/docs/source/en/model_doc/colqwen2.md
@ -0,0 +1,176 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
+# ColQwen2
+
+[ColQwen2](https://doi.org/10.48550/arXiv.2407.01449) is a variant of the [ColPali](./colpali) model designed to retrieve documents by analyzing their visual features. Unlike traditional systems that rely heavily on text extraction and OCR, ColQwen2 treats each page as an image. It uses the [Qwen2-VL](./qwen2_vl) backbone to capture not only text, but also the layout, tables, charts, and other visual elements to create detailed multi-vector embeddings that can be used for retrieval by computing pairwise late interaction similarity scores. This offers a more comprehensive understanding of documents and enables more efficient and accurate retrieval.
+
+This model was contributed by [@tonywu71](https://huggingface.co/tonywu71) (ILLUIN Technology) and [@yonigozlan](https://huggingface.co/yonigozlan) (HuggingFace).
+
+You can find all the original ColQwen2 checkpoints under Vidore's [Hf-native ColVision Models](https://huggingface.co/collections/vidore/hf-native-colvision-models-6755d68fc60a8553acaa96f7) collection.
+
+> [!TIP]
+> Click on the ColQwen2 models in the right sidebar for more examples of how to use ColQwen2 for image retrieval.
+
+<hfoptions id="usage">
+<hfoption id="image retrieval">
+
+```python
+import requests
+import torch
+from PIL import Image
+
+from transformers import ColQwen2ForRetrieval, ColQwen2Processor
+from transformers.utils.import_utils import is_flash_attn_2_available
+
+
+# Load the model and the processor
+model_name = "vidore/colqwen2-v1.0-hf"
+
+model = ColQwen2ForRetrieval.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",  # "cpu", "cuda", or "mps" for Apple Silicon
+    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa",
+)
+processor = ColQwen2Processor.from_pretrained(model_name)
+
+# The document page screenshots from your corpus
+url1 = "https://upload.wikimedia.org/wikipedia/commons/8/89/US-original-Declaration-1776.jpg"
+url2 = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Romeoandjuliet1597.jpg/500px-Romeoandjuliet1597.jpg"
+
+images = [
+    Image.open(requests.get(url1, stream=True).raw),
+    Image.open(requests.get(url2, stream=True).raw),
+]
+
+# The queries you want to retrieve documents for
+queries = [
+    "When was the United States Declaration of Independence proclaimed?",
+    "Who printed the edition of Romeo and Juliet?",
+]
+
+# Process the inputs
+inputs_images = processor(images=images).to(model.device)
+inputs_text = processor(text=queries).to(model.device)
+
+# Forward pass
+with torch.no_grad():
+    image_embeddings = model(**inputs_images).embeddings
+    query_embeddings = model(**inputs_text).embeddings
+
+# Score the queries against the images
+scores = processor.score_retrieval(query_embeddings, image_embeddings)
+
+print("Retrieval scores (query x image):")
+print(scores)
+```
+
+If you have issues loading the images with PIL, you can use the following code to create dummy images:
+
+```python
+images = [
+    Image.new("RGB", (128, 128), color="white"),
+    Image.new("RGB", (64, 32), color="black"),
+]
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to int4.
+
+```python
+import requests
+import torch
+from PIL import Image
+
+from transformers import BitsAndBytesConfig, ColQwen2ForRetrieval, ColQwen2Processor
+
+
+model_name = "vidore/colqwen2-v1.0-hf"
+
+# 4-bit quantization configuration
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+)
+
+model = ColQwen2ForRetrieval.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map="cuda",
+).eval()
+
+processor = ColQwen2Processor.from_pretrained(model_name)
+
+url1 = "https://upload.wikimedia.org/wikipedia/commons/8/89/US-original-Declaration-1776.jpg"
+url2 = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Romeoandjuliet1597.jpg/500px-Romeoandjuliet1597.jpg"
+
+images = [
+    Image.open(requests.get(url1, stream=True).raw),
+    Image.open(requests.get(url2, stream=True).raw),
+]
+
+queries = [
+    "When was the United States Declaration of Independence proclaimed?",
+    "Who printed the edition of Romeo and Juliet?",
+]
+
+# Process the inputs
+inputs_images = processor(images=images, return_tensors="pt").to(model.device)
+inputs_text = processor(text=queries, return_tensors="pt").to(model.device)
+
+# Forward pass
+with torch.no_grad():
+    image_embeddings = model(**inputs_images).embeddings
+    query_embeddings = model(**inputs_text).embeddings
+
+# Score the queries against the images
+scores = processor.score_retrieval(query_embeddings, image_embeddings)
+
+print("Retrieval scores (query x image):")
+print(scores)
+```
+
+## Notes
+
+- [`~ColQwen2Processor.score_retrieval`] returns a 2D tensor where the first dimension is the number of queries and the second dimension is the number of images. A higher score indicates more similarity between the query and image.
+- Unlike ColPali, ColQwen2 supports arbitrary image resolutions and aspect ratios, which means images are not resized into fixed-size squares. This preserves more of the original input signal.
+- Larger input images generate longer multi-vector embeddings, allowing users to adjust image resolution to balance performance and memory usage.
+
+## ColQwen2Config
+
+[[autodoc]] ColQwen2Config
+
+## ColQwen2Processor
+
+[[autodoc]] ColQwen2Processor
+
+## ColQwen2ForRetrieval
+
+[[autodoc]] ColQwen2ForRetrieval
+    - forward
--- a/docs/source/en/model_doc/csm.md
+++ b/docs/source/en/model_doc/csm.md
@ -0,0 +1,382 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Csm
+
+## Overview
+
+The Conversational Speech Model (CSM) is the first open-source contextual text-to-speech model [released by Sesame](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice). It is designed to generate natural-sounding speech with or without conversational context. This context typically consists of multi-turn dialogue between speakers, represented as sequences of text and corresponding spoken audio.
+
+**Model Architecture:**
+CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi.md), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.
+
+The original csm-1b checkpoint is available under the [Sesame](https://huggingface.co/sesame/csm-1b) organization on Hugging Face.
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/csm_architecture.png"/>
+</div>
+
+## Usage Tips
+
+### Without Conversational Context
+
+CSM can be used to simply generate speech from a text prompt:
+
+```python
+import torch
+from transformers import CsmForConditionalGeneration, AutoProcessor
+
+model_id = "sesame/csm-1b"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# load the model and the processor
+processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
+
+# prepare the inputs
+text = "[0]The past is just a story we tell ourselves." # `[0]` for speaker id 0
+inputs = processor(text, add_special_tokens=True).to(device)
+
+# another equivalent way to prepare the inputs
+conversation = [
+    {"role": "0", "content": [{"type": "text", "text": "The past is just a story we tell ourselves."}]},
+]
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+).to(device)
+
+# run inference
+audio = model.generate(**inputs, output_audio=True)
+processor.save_audio(audio, "example_without_context.wav")
+```
+
+### With Conversational Context
+
+CSM can be used to generate speech given a conversation, allowing consistency in the voices and content-aware generation:
+
+```python
+import torch
+from transformers import CsmForConditionalGeneration, AutoProcessor
+from datasets import load_dataset, Audio
+
+model_id = "sesame/csm-1b"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# load the model and the processor
+processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
+
+# prepare the inputs
+ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
+# ensure the audio is 24kHz
+ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+conversation = []
+
+# 1. context
+for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
+    conversation.append(
+        {
+            "role": f"{speaker_id}",
+            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
+        }
+    )
+
+# 2. text prompt
+conversation.append({"role": f"{ds[4]['speaker_id']}", "content": [{"type": "text", "text": ds[4]["text"]}]})
+
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+).to(device)
+
+# run inference
+audio = model.generate(**inputs, output_audio=True)
+processor.save_audio(audio, "example_with_context.wav")
+```
+
+### Batched Inference
+
+CSM supports batched inference!
+
+```python
+import torch
+from transformers import CsmForConditionalGeneration, AutoProcessor
+from datasets import load_dataset, Audio
+
+model_id = "sesame/csm-1b"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# load the model and the processor
+processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
+
+# prepare the inputs 
+ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
+# ensure the audio is 24kHz
+ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+# here is a batch with two prompts
+conversation = [
+    [
+        {
+            "role": f"{ds[0]['speaker_id']}",
+            "content": [
+                {"type": "text", "text": ds[0]["text"]},
+                {"type": "audio", "path": ds[0]["audio"]["array"]},
+            ],
+        },
+        {
+            "role": f"{ds[1]['speaker_id']}",
+            "content": [
+                {"type": "text", "text": ds[1]["text"]},
+            ],
+        },
+    ],
+    [
+        {
+            "role": f"{ds[0]['speaker_id']}",
+            "content": [
+                {"type": "text", "text": ds[0]["text"]},
+            ],
+        }
+    ],
+]
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+).to(device)
+
+audio = model.generate(**inputs, output_audio=True)
+processor.save_audio(audio, [f"speech_batch_idx_{i}.wav" for i in range(len(audio))])
+```
+
+### Making The Model Go Brrr
+
+CSM supports full-graph compilation with CUDA graphs!
+
+```python
+import torch
+import copy
+from transformers import CsmForConditionalGeneration, AutoProcessor
+from datasets import load_dataset
+
+model_id = "sesame/csm-1b"
+device = "cuda"
+
+# set logs to ensure no recompilation and graph breaks
+torch._logging.set_logs(graph_breaks=True, recompiles=True, cudagraphs=True)
+
+# load the model and the processor
+processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
+
+# use a static cache, which automatically enables torch.compile with fullgraph and reduce-overhead
+model.generation_config.max_length = 250 # big enough to avoid recompilation
+model.generation_config.max_new_tokens = None # would take precedence over max_length
+model.generation_config.cache_implementation = "static"
+model.depth_decoder.generation_config.cache_implementation = "static"
+
+# generation kwargs
+gen_kwargs = {
+    "do_sample": False,
+    "depth_decoder_do_sample": False,
+    "temperature": 1.0,
+    "depth_decoder_temperature": 1.0,
+}
+
+# Define a timing context manager
+class TimerContext:
+    def __init__(self, name="Execution"):
+        self.name = name
+        self.start_event = None
+        self.end_event = None
+        
+    def __enter__(self):
+        # Use CUDA events for more accurate GPU timing
+        self.start_event = torch.cuda.Event(enable_timing=True)
+        self.end_event = torch.cuda.Event(enable_timing=True)
+        self.start_event.record()
+        return self
+
+    def __exit__(self, *args):
+        self.end_event.record()
+        torch.cuda.synchronize()
+        elapsed_time = self.start_event.elapsed_time(self.end_event) / 1000.0
+        print(f"{self.name} time: {elapsed_time:.4f} seconds")
+
+# prepare the inputs 
+ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
+
+conversation = [
+    {
+        "role": f"{ds[0]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[0]["text"]},
+            {"type": "audio", "path": ds[0]["audio"]["array"]},
+        ],
+    },
+    {
+        "role": f"{ds[1]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[1]["text"]},
+            {"type": "audio", "path": ds[1]["audio"]["array"]},
+        ],
+    },
+    {
+        "role": f"{ds[2]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[2]["text"]},
+        ],
+    },
+]
+
+padded_inputs_1 = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+).to(device)
+
+print("\n" + "="*50)
+print("First generation - compiling and recording CUDA graphs...")
+with TimerContext("First generation"):
+    _ = model.generate(**padded_inputs_1, **gen_kwargs)
+print("="*50)
+
+print("\n" + "="*50)
+print("Second generation - fast !!!")
+with TimerContext("Second generation"):
+    _ = model.generate(**padded_inputs_1, **gen_kwargs)
+print("="*50)
+
+# now with different inputs
+conversation = [
+    {
+        "role": f"{ds[0]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[2]["text"]},
+            {"type": "audio", "path": ds[2]["audio"]["array"]},
+        ],
+    },
+    {
+        "role": f"{ds[1]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[3]["text"]},
+            {"type": "audio", "path": ds[3]["audio"]["array"]},
+        ],
+    },
+    {
+        "role": f"{ds[2]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[4]["text"]},
+        ],
+    },
+]
+padded_inputs_2 = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+).to(device)
+
+print("\n" + "="*50)
+print("Generation with other inputs!")
+with TimerContext("Generation with different inputs"):
+    _ = model.generate(**padded_inputs_2, **gen_kwargs)
+print("="*50)
+```
+
+### Training
+
+CSM Transformers integration supports training!
+
+```python
+from transformers import CsmForConditionalGeneration, AutoProcessor
+from datasets import load_dataset, Audio
+
+model_id = "sesame/csm-1b"
+device = "cuda"
+
+# load the model and the processor
+processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
+model.train()
+model.codec_model.eval()
+
+ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
+# ensure the audio is 24kHz
+ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+conversation = []
+
+# context
+for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
+    conversation.append(
+        {
+            "role": f"{speaker_id}",
+            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
+        }
+    )
+
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+    output_labels=True,
+).to(device)
+
+out = model(**inputs)
+out.loss.backward()
+```
+
+This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
+The original code can be found [here](https://github.com/SesameAILabs/csm).
+
+
+## CsmConfig
+
+[[autodoc]] CsmConfig
+
+## CsmDepthDecoderConfig
+
+[[autodoc]] CsmDepthDecoderConfig
+
+## CsmProcessor
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/fig1.jpg"/>
+</div>
+
+[[autodoc]] CsmProcessor
+    - __call__
+
+## CsmForConditionalGeneration
+
+[[autodoc]] CsmForConditionalGeneration
+    - forward
+    - generate
+
+## CsmDepthDecoderForCausalLM
+
+[[autodoc]] CsmDepthDecoderForCausalLM
+
+## CsmDepthDecoderModel
+
+[[autodoc]] CsmDepthDecoderModel
+
+## CsmBackboneModel
+
+[[autodoc]] CsmBackboneModel
--- a/docs/source/en/model_doc/data2vec.md
+++ b/docs/source/en/model_doc/data2vec.md
@ -53,6 +53,7 @@ The original code for vision can be found [here](https://github.com/facebookrese
 - For Data2VecAudio, preprocessing is identical to [`Wav2Vec2Model`], including feature extraction
 - For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization.
 - For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction.
+- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to take effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`.

 ### Using Scaled Dot Product Attention (SDPA)

--- a/docs/source/en/model_doc/deepseek_v3.md
+++ b/docs/source/en/model_doc/deepseek_v3.md
@ -28,8 +28,8 @@ We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 67
 We are super happy to make this code community-powered, and would love to see how you can best optimize the following: 

 - current implementation uses the "naive" attention compution (so not really MLA)
- current implementation loops through the experts. This should be replaced. Pointers to use `get_packed_weights` from `intetrations/tensor_parallel`. 
- current implementation uses the eleuther formula for ROPE, using the orginal one would be more efficient! (should still follow our API)
+- current implementation loops through the experts. This should be replaced. Pointers to use `get_packed_weights` from `integrations/tensor_parallel`. 
+- current implementation uses the eleuther formula for ROPE, using the original one would be more efficient! (should still follow our API)
 - static cache is not supported (this should be just a generation config issue / config shape issues)

 ### Usage tips
--- a/docs/source/en/model_doc/dinov2.md
+++ b/docs/source/en/model_doc/dinov2.md
@ -111,33 +111,68 @@ print("Predicted class:", model.config.id2label[predicted_class_idx])

 ## Notes

- Use [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) to speedup inference. However, it will produce some mismatched elements. The difference between the original and traced model is 1e-4.
+- The example below shows how to split the output tensor into:
+  - one embedding for the whole image, commonly referred to as a `CLS` token,
+    useful for classification and retrieval
+  - a set of local embeddings, one for each `14x14` patch of the input image,
+    useful for dense tasks, such as semantic segmentation

-    ```py
-    import torch
-    from transformers import AutoImageProcessor, AutoModel
-    from PIL import Image
-    import requests
+  ```py
+  from transformers import AutoImageProcessor, AutoModel
+  from PIL import Image
+  import requests
+  
+  url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+  image = Image.open(requests.get(url, stream=True).raw)
+  print(image.height, image.width)  # [480, 640]
+  
+  processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
+  model = AutoModel.from_pretrained('facebook/dinov2-base')
+  patch_size = model.config.patch_size
+  
+  inputs = processor(images=image, return_tensors="pt")
+  print(inputs.pixel_values.shape)  # [1, 3, 224, 224]
+  batch_size, rgb, img_height, img_width = inputs.pixel_values.shape
+  num_patches_height, num_patches_width = img_height // patch_size, img_width // patch_size
+  num_patches_flat = num_patches_height * num_patches_width
+  
+  outputs = model(**inputs)
+  last_hidden_states = outputs[0]
+  print(last_hidden_states.shape)  # [1, 1 + 256, 768]
+  assert last_hidden_states.shape == (batch_size, 1 + num_patches_flat, model.config.hidden_size)
+  
+  cls_token = last_hidden_states[:, 0, :]
+  patch_features = last_hidden_states[:, 1:, :].unflatten(1, (num_patches_height, num_patches_width))
+  ```

-    url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-    image = Image.open(requests.get(url, stream=True).raw)
+- Use [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) to speed up inference.
+  However, it will produce some mismatched elements. The difference between the original and traced model is on the order of 1e-4.

-    processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
-    model = AutoModel.from_pretrained('facebook/dinov2-base')
-
-    inputs = processor(images=image, return_tensors="pt")
-    outputs = model(**inputs)
-    last_hidden_states = outputs[0]
-
-    # We have to force return_dict=False for tracing
-    model.config.return_dict = False
-
-    with torch.no_grad():
-        traced_model = torch.jit.trace(model, [inputs.pixel_values])
-        traced_outputs = traced_model(inputs.pixel_values)
-
-    print((last_hidden_states - traced_outputs[0]).abs().max())
-    ```
+  ```py
+  import torch
+  from transformers import AutoImageProcessor, AutoModel
+  from PIL import Image
+  import requests
+  
+  url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+  image = Image.open(requests.get(url, stream=True).raw)
+  
+  processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
+  model = AutoModel.from_pretrained('facebook/dinov2-base')
+  
+  inputs = processor(images=image, return_tensors="pt")
+  outputs = model(**inputs)
+  last_hidden_states = outputs[0]
+  
+  # We have to force return_dict=False for tracing
+  model.config.return_dict = False
+  
+  with torch.no_grad():
+      traced_model = torch.jit.trace(model, [inputs.pixel_values])
+      traced_outputs = traced_model(inputs.pixel_values)
+  
+  print((last_hidden_states - traced_outputs[0]).abs().max())
+  ```

 ## Dinov2Config

--- a/docs/source/en/model_doc/distilbert.md
+++ b/docs/source/en/model_doc/distilbert.md
@ -83,10 +83,10 @@ print(f"Predicted label: {predicted_label}")

 </hfoption>

-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
-echo -e "I love using Hugging Face Transformers!" | transformers-cli run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
+echo -e "I love using Hugging Face Transformers!" | transformers run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
 ```

 </hfoption>
@ -213,7 +213,3 @@ echo -e "I love using Hugging Face Transformers!" | transformers-cli run --task

 </jax>
 </frameworkcontent>
-
-
-
-
--- a/docs/source/en/model_doc/electra.md
+++ b/docs/source/en/model_doc/electra.md
@ -45,9 +45,9 @@ import torch
 from transformers import pipeline

 classifier = pipeline(
-    task="text-classification", 
-    model="bhadresh-savani/electra-base-emotion", 
-    torch_dtype=torch.float16, 
+    task="text-classification",
+    model="bhadresh-savani/electra-base-emotion",
+    torch_dtype=torch.float16,
    device=0
 )
 classifier("This restaurant has amazing food!")
@ -64,7 +64,7 @@ tokenizer = AutoTokenizer.from_pretrained(
    "bhadresh-savani/electra-base-emotion",
 )
 model = AutoModelForSequenceClassification.from_pretrained(
-    "bhadresh-savani/electra-base-emotion", 
+    "bhadresh-savani/electra-base-emotion",
    torch_dtype=torch.float16
 )
 inputs = tokenizer("ELECTRA is more efficient than BERT", return_tensors="pt")
@ -78,10 +78,10 @@ print(f"Predicted label: {predicted_label}")
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
-echo -e "This restaurant has amazing food." | transformers-cli run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
+echo -e "This restaurant has amazing food." | transformers run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
 ```

 </hfoption>
@ -96,12 +96,12 @@ echo -e "This restaurant has amazing food." | transformers-cli run --task text-c

    ```py
    # Example of properly handling padding with attention masks
-    inputs = tokenizer(["Short text", "This is a much longer text that needs padding"], 
-                    padding=True, 
+    inputs = tokenizer(["Short text", "This is a much longer text that needs padding"],
+                    padding=True,
                    return_tensors="pt")
    outputs = model(**inputs)  # automatically uses the attention_mask
    ```
-    
+
 - When using the discriminator for a downstream task, you can load it into any of the ELECTRA model classes ([`ElectraForSequenceClassification`], [`ElectraForTokenClassification`], etc.).

 ## ElectraConfig
--- a/docs/source/en/model_doc/emu3.md
+++ b/docs/source/en/model_doc/emu3.md
@ -174,6 +174,10 @@ for i, image in enumerate(images['pixel_values']):
 [[autodoc]] Emu3TextModel
    - forward

+## Emu3Model
+
+[[autodoc]] Emu3Model
+
 ## Emu3ForCausalLM

 [[autodoc]] Emu3ForCausalLM
--- a/docs/source/en/model_doc/falcon.md
+++ b/docs/source/en/model_doc/falcon.md
@ -41,7 +41,7 @@ import torch
 from transformers import pipeline

 pipeline = pipeline(
-    task="text-generation", 
+    task="text-generation",
    model="tiiuae/falcon-7b-instruct",
    torch_dtype=torch.bfloat16,
    device=0
@ -76,11 +76,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
 # pip install -U flash-attn --no-build-isolation
-transformers-cli chat --model_name_or_path tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
+transformers chat tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
 ```

 </hfoption>
@ -150,4 +150,4 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ## FalconForQuestionAnswering

 [[autodoc]] FalconForQuestionAnswering
-    - forward
+    - forward
--- a/docs/source/en/model_doc/falcon_h1.md
+++ b/docs/source/en/model_doc/falcon_h1.md
@ -0,0 +1,65 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+-->
+
+# FalconH1
+
+## Overview
+
+The FalconH1 model was developed by the TII Pretraining team. A comprehensive research paper covering the architecture, pretraining dynamics, experimental results, and conclusions is forthcoming. You can read more about this series in the [Falcon-H1 GitHub repository](https://github.com/tiiuae/Falcon-H1).
+
+## Contributors
+
+This model was contributed by [DhiyaEddine](https://huggingface.co/DhiyaEddine), [ybelkada](https://huggingface.co/ybelkada), [JingweiZuo](https://huggingface.co/JingweiZuo), [IlyasChahed](https://huggingface.co/IChahed), and [MaksimVelikanov](https://huggingface.co/yellowvm).
+The original code can be found [here](https://github.com/tiiuae/Falcon-H1).
+
+
+## FalconH1Config
+
+| Model     | Depth | Dim  | Attn Heads | KV | Mamba Heads | d_head       | d_state | Ctx Len        |
+|-----------|--------|------|------------|----|--------------|--------------|------|-----------------|
+| H1 0.5B   | 36     | 1024 | 8          | 2  | 24           | 64 / 64      | 128  | 4K, 16K-SFT     |
+| H1 1.5B   | 24     | 2048 | 8          | 2  | 48           | 128 / 64     | 256  | 128K            |
+| H1 1.5B-d | 66     | 1280 | 6          | 2  | 24           | 128 / 64     | 256  | 128K            |
+| H1 3B     | 32     | 2560 | 10         | 2  | 32           | 128 / 128    | 256  | 128K            |
+| H1 7B     | 44     | 3072 | 12         | 2  | 24           | 128 / 128    | 256  | 256K            |
+| H1 34B    | 72     | 5120 | 20         | 4  | 32           | 128 / 128    | 256  | 256K            |
+
+
+
+[[autodoc]] FalconH1Config
+
+<!---
+## Usage Tips
+Tips: 
+- The architecture is based on Mamba-2 models.
+## FalconH1Model
+[[autodoc]] FalconH1Model
+    - forward
+-->
+
+## FalconH1ForCausalLM
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("tiiuae/Falcon-H1-7B-Instruct")
+tokenizer = AutoTokenizer.from_pretrained("tiiuae/Falcon-H1-7B-Instruct")
+
+message = ["Mamba is a snake with following properties  "]
+inputs = tokenizer(message, return_tensors='pt', return_token_type_ids=False)
+response = model.generate(**inputs, max_new_tokens=64)
+print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
+```
+
+[[autodoc]] FalconH1ForCausalLM
+    - forward
+
+This HF implementation was contributed by [younesbelkada](https://github.com/younesbelkada) and [DhiaEddineRhaiem](https://github.com/dhiaEddineRhaiem).
--- a/docs/source/en/model_doc/falcon_mamba.md
+++ b/docs/source/en/model_doc/falcon_mamba.md
@ -39,7 +39,7 @@ import torch
 from transformers import pipeline

 pipeline = pipeline(
-    "text-generation", 
+    "text-generation",
    model="tiiuae/falcon-mamba-7b-instruct",
    torch_dtype=torch.bfloat16,
    device=0
@ -73,10 +73,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
-transformers-cli chat --model_name_or_path tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
+transformers chat tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/fuyu.md
+++ b/docs/source/en/model_doc/fuyu.md
@ -103,6 +103,10 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.

 [[autodoc]] FuyuConfig

+## FuyuModel
+
+[[autodoc]] FuyuModel
+
 ## FuyuForCausalLM

 [[autodoc]] FuyuForCausalLM
--- a/docs/source/en/model_doc/gemma.md
+++ b/docs/source/en/model_doc/gemma.md
@ -80,10 +80,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
-echo -e "LLMs generate text through a process known as" | transformers-cli run --task text-generation --model google/gemma-2b --device 0
+echo -e "LLMs generate text through a process known as" | transformers run --task text-generation --model google/gemma-2b --device 0
 ```

 </hfoption>
@ -114,8 +114,8 @@ model = AutoModelForCausalLM.from_pretrained(
 input_text = "LLMs generate text through a process known as."
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
 outputs = model.generate(
-    **input_ids, 
-    max_new_tokens=50, 
+    **input_ids,
+    max_new_tokens=50,
    cache_implementation="static"
 )
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@ -127,7 +127,7 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl
 from transformers.utils.attention_visualizer import AttentionMaskVisualizer

 visualizer = AttentionMaskVisualizer("google/gemma-2b")
-visualizer("LLMs generate text through a process known as") 
+visualizer("LLMs generate text through a process known as")
 ```

 <div class="flex justify-center">
--- a/docs/source/en/model_doc/gemma2.md
+++ b/docs/source/en/model_doc/gemma2.md
@ -58,7 +58,7 @@ pipe("Explain quantum computing simply. ", max_new_tokens=50)

 </hfoption>
 <hfoption id="AutoModel">
-    
+
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
@ -80,16 +80,16 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```
-echo -e "Explain quantum computing simply." | transformers-cli run --task text-generation --model google/gemma-2-2b --device 0
+echo -e "Explain quantum computing simply." | transformers run --task text-generation --model google/gemma-2-2b --device 0
 ```
 </hfoption>
 </hfoptions>

 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-	
+
 The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.

 ```python
@ -118,7 +118,7 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl
 ```python
 from transformers.utils.attention_visualizer import AttentionMaskVisualizer
 visualizer = AttentionMaskVisualizer("google/gemma-2b")
-visualizer("You are an assistant. Make sure you print me") 
+visualizer("You are an assistant. Make sure you print me")
 ```

 <div class="flex justify-center">
@ -137,7 +137,7 @@ visualizer("You are an assistant. Make sure you print me")

    inputs = tokenizer(text="My name is Gemma", return_tensors="pt")
    max_generated_length = inputs.input_ids.shape[1] + 10
-    past_key_values = HybridCache(config=model.config, max_batch_size=1, 
+    past_key_values = HybridCache(config=model.config, max_batch_size=1,
    max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
    outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
    ```
--- a/docs/source/en/model_doc/gemma3.md
+++ b/docs/source/en/model_doc/gemma3.md
@ -28,7 +28,7 @@ rendered properly in your Markdown viewer.

 The instruction-tuned variant was post-trained with knowledge distillation and reinforcement learning.

-You can find all the original Gemma 3 checkpoints under the [Gemma 3](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b) release.
+You can find all the original Gemma 3 checkpoints under the [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) release.

 > [!TIP]
 > Click on the Gemma 3 models in the right sidebar for more examples of how to apply Gemma to different vision and language tasks.
@ -99,10 +99,10 @@ print(processor.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
-echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model google/gemma-3-1b-pt --device 0
+echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model google/gemma-3-1b-pt --device 0
 ```

 </hfoption>
@ -254,6 +254,10 @@ visualizer("<img>What is shown in this image?")
 [[autodoc]] Gemma3TextModel
    - forward

+## Gemma3Model
+
+[[autodoc]] Gemma3Model
+
 ## Gemma3ForCausalLM

 [[autodoc]] Gemma3ForCausalLM
--- a/docs/source/en/model_doc/got_ocr2.md
+++ b/docs/source/en/model_doc/got_ocr2.md
@ -277,6 +277,10 @@ alt="drawing" width="600"/>

 [[autodoc]] GotOcr2Processor

+## GotOcr2Model
+
+[[autodoc]] GotOcr2Model
+
 ## GotOcr2ForConditionalGeneration

 [[autodoc]] GotOcr2ForConditionalGeneration
--- a/docs/source/en/model_doc/gpt2.md
+++ b/docs/source/en/model_doc/gpt2.md
@ -64,15 +64,21 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
-echo -e "Hello, I'm a language model" | transformers-cli run --task text-generation --model openai-community/gpt2 --device 0
+echo -e "Hello, I'm a language model" | transformers run --task text-generation --model openai-community/gpt2 --device 0
 ```

 </hfoption>
 </hfoptions>

+You can also serve the model using vLLM with the `transformers` backend.
+
+```
+vllm serve openai-community/gpt2 --model-impl transformers
+```
+
 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

 The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
@ -82,16 +88,16 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

 quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,  
-    bnb_4bit_quant_type="nf4",  
-    bnb_4bit_compute_dtype="float16",  
-    bnb_4bit_use_double_quant=True 
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype="float16",
+    bnb_4bit_use_double_quant=True
 )

 model = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2-xl",
    quantization_config=quantization_config,
-    device_map="auto"  
+    device_map="auto"
 )

 tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl")
--- a/docs/source/en/model_doc/gpt_bigcode.md
+++ b/docs/source/en/model_doc/gpt_bigcode.md
@ -46,8 +46,12 @@ The main differences compared to GPT2.
 - Merge the key and value caches into one (this changes the format of layer_past/ present, does it risk creating problems?)
 - Use the memory layout (self.num_heads, 3, self.head_dim) instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA. (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original openai-community/gpt2 model).

+
 You can read more about the optimizations in the [original pull request](https://github.com/huggingface/transformers/pull/22575)

+> [!NOTE]
+> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to take effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`.
+
 ## Combining Starcoder and Flash Attention 2

 First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
--- a/docs/source/en/model_doc/gpt_neo.md
+++ b/docs/source/en/model_doc/gpt_neo.md
@ -14,93 +14,94 @@ rendered properly in your Markdown viewer.

 -->

-# GPT Neo
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-</div>
-
-## Overview
-
-The GPTNeo model was released in the [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) repository by Sid
-Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. It is a GPT2 like causal language model trained on the
-[Pile](https://pile.eleuther.ai/) dataset.
-
-The architecture is similar to GPT2 except that GPT Neo uses local attention in every other layer with a window size of
-256 tokens.
-
-This model was contributed by [valhalla](https://huggingface.co/valhalla).
-
-## Usage example
-
-The `generate()` method can be used to generate text using GPT Neo model.
-
-```python
->>> from transformers import GPTNeoForCausalLM, GPT2Tokenizer
-
->>> model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
->>> tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
-
->>> prompt = (
-...     "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
-...     "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
-...     "researchers was the fact that the unicorns spoke perfect English."
-... )
-
->>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-
->>> gen_tokens = model.generate(
-...     input_ids,
-...     do_sample=True,
-...     temperature=0.9,
-...     max_length=100,
-... )
->>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
-```
-
-## Combining GPT-Neo and Flash Attention 2
-
-First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature, and make sure your hardware is compatible with Flash-Attention 2. More details are available [here](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2) concerning the installation.
-
-Make sure as well to load your model in half-precision (e.g. `torch.float16`).
-
-To load and run a model using Flash Attention 2, refer to the snippet below:
-
-```python
->>> import torch
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
->>> device = "cuda" # the device to load the model onto
-
->>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
->>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
-
->>> prompt = "def hello_world():"
-
->>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
->>> model.to(device)
-
->>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
->>> tokenizer.batch_decode(generated_ids)[0]
-"def hello_world():\n    >>> run_script("hello.py")\n    >>> exit(0)\n<|endoftext|>"
-```
-
-### Expected speedups
-
-Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `EleutherAI/gpt-neo-2.7B` checkpoint and the Flash Attention 2 version of the model.
-Note that for GPT-Neo it is not possible to train / run on very long context as the max [position embeddings](https://huggingface.co/EleutherAI/gpt-neo-2.7B/blob/main/config.json#L58 ) is limited to 2048 - but this is applicable to all gpt-neo models and not specific to FA-2
-
-<div style="text-align: center">
-<img src="https://user-images.githubusercontent.com/49240599/272241893-b1c66b75-3a48-4265-bc47-688448568b3d.png">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZC
D9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+        ">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+    </div>
 </div>


-## Resources
+## GPT-Neo

- [Text classification task guide](../tasks/sequence_classification)
- [Causal language modeling task guide](../tasks/language_modeling)
+[GPT-Neo](https://zenodo.org/records/5297715) is an open-source alternative to GPT-2 and GPT-3 models, built with Mesh TensorFlow for TPUs. GPT-Neo uses local attention in every other layer for more efficiency. It is trained on the [Pile](https://huggingface.co/datasets/EleutherAI/pile), a diverse dataset consisting of 22 smaller high-quality datasets.
+
+
+You can find all the original GPT-Neo checkpoints under the [EleutherAI](https://huggingface.co/EleutherAI?search_models=gpt-neo) organization.
+
+> [!TIP]
+> Click on the GPT-Neo models in the right sidebar for more examples of how to apply GPT-Neo to different language tasks.
+
+The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`], and from the command line.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(task="text-generation", model="EleutherAI/gpt-neo-1.3B", torch_dtype=torch.float16, device=0)
+pipeline("Hello, I'm a language model")
+```
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", torch_dtype=torch.float16, device_map="auto", attn_implementation="flash_attention_2")
+tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
+
+input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "Hello, I'm a language model" | transformers-cli run --task text-generation --model EleutherAI/gpt-neo-1.3B --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize only the weights to 4-bit precision.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype="float16",
+    bnb_4bit_use_double_quant=True
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    "EleutherAI/gpt-neo-2.7B",
+    quantization_config=quantization_config,
+    device_map="auto"
+)
+
+tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
+inputs = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")
+outputs = model.generate(**inputs, max_new_tokens=100)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+## Notes
+
+- Pad inputs on the right because GPT-Neo uses absolute position embeddings.

 ## GPTNeoConfig

--- a/Show More
+++ b/Show More