Release: v4.52.4

[qwen-vl] Look for vocab size in text config (#38372 )
fix qwen
2025-10-21 01:23:56 +08:00 · 2025-05-30 11:14:55 +02:00 · 2025-05-30 11:05:43 +02:00 · 2025-05-28 09:31:32 +02:00 · 2025-05-28 09:31:32 +02:00 · 2025-05-28 09:31:32 +02:00
2512 changed files with 124953 additions and 177408 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -7,6 +7,18 @@ parameters:
    nightly:
        type: boolean
        default: false
+    GHA_Actor:
+        type: string
+        default: ""
+    GHA_Action:
+        type: string
+        default: ""
+    GHA_Event:
+        type: string
+        default: ""
+    GHA_Meta:
+        type: string
+        default: ""

 jobs:
    # Ensure running with CircleCI/huggingface
@ -31,8 +43,10 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: if [[ "$CIRCLE_PULL_REQUEST" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
-            - run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/${CIRCLE_PULL_REQUEST##*/} >> github.txt'
+            - run: python3 utils/extract_pr_number_from_circleci.py > pr_number.txt
+            - run: echo $(cat pr_number.txt)
+            - run: if [[ "$(cat pr_number.txt)" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
+            - run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$(cat pr_number.txt) >> github.txt'
            - run: cat github.txt
            - run: (python3 -c 'import json; from datetime import datetime; fp = open("github.txt"); data = json.load(fp); fp.close(); f = "%Y-%m-%dT%H:%M:%SZ"; created = datetime.strptime(data["created_at"], f); updated = datetime.strptime(data["updated_at"], f); s = (updated - created).total_seconds(); print(int(s))' || true) > elapsed.txt
            - run: if [ "$(cat elapsed.txt)" == "" ]; then echo 60 > elapsed.txt; fi
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -28,6 +28,8 @@ COMMON_ENV_VARIABLES = {
    "TRANSFORMERS_IS_CI": True,
    "PYTEST_TIMEOUT": 120,
    "RUN_PIPELINE_TESTS": False,
+    # will be adjust in `CircleCIJob.to_dict`.
+    "RUN_FLAKY": True,
 }
 # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical
 COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}
@ -108,6 +110,7 @@ class CircleCIJob:
            print(f"Using {self.docker_image} docker image")
        if self.install_steps is None:
            self.install_steps = ["uv venv && uv pip install ."]
+        self.install_steps.append("uv venv && uv pip install git+https://github.com/ydshieh/pytest.git@8.3.5-ydshieh git+https://github.com/ydshieh/pluggy.git@1.5.0-ydshieh")
        if self.pytest_options is None:
            self.pytest_options = {}
        if isinstance(self.tests_to_run, str):
@ -126,6 +129,8 @@ class CircleCIJob:

    def to_dict(self):
        env = COMMON_ENV_VARIABLES.copy()
+        # Do not run tests decorated by @is_flaky on pull requests
+        env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
        env.update(self.additional_env)

        job = {
@ -393,7 +398,12 @@ def create_circleci_config(folder=None):
        "parameters": {
            # Only used to accept the parameters from the trigger
            "nightly": {"type": "boolean", "default": False},
-            "tests_to_run": {"type": "string", "default": ''},
+            # Only used to accept the parameters from GitHub Actions trigger
+            "GHA_Actor": {"type": "string", "default": ""},
+            "GHA_Action": {"type": "string", "default": ""},
+            "GHA_Event": {"type": "string", "default": ""},
+            "GHA_Meta": {"type": "string", "default": ""},
+            "tests_to_run": {"type": "string", "default": ""},
            **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs},
            **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs},
        },
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -16,7 +16,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
      placeholder: transformers version, platform, python version, ...
    validations:
      required: true
@ -56,6 +56,12 @@ body:
          - ray/raytune: @richardliaw, @amogkam
          - Big Model Inference: @SunMarc
          - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
+        
+        Devices/Backends:
+        
+          - AMD ROCm: @ivarflakstad
+          - Intel XPU: @IlyasMoutawwakil
+          - Ascend NPU: @ivarflakstad 

        Documentation: @stevhliu

--- a/.github/ISSUE_TEMPLATE/i18n.md
+++ b/.github/ISSUE_TEMPLATE/i18n.md
@ -23,7 +23,7 @@ Some notes:
 * Please translate in a gender-neutral way.
 * Add your translations to the folder called `<languageCode>` inside the [source folder](https://github.com/huggingface/transformers/tree/main/docs/source).
 * Register your translation in `<languageCode>/_toctree.yml`; please follow the order of the [English version](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml).
-* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu and @MKhalusova for review.
+* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu for review.
 * 🙋 If you'd like others to help you with the translation, you can also post in the 🤗 [forums](https://discuss.huggingface.co/).

 ## Get Started section
--- a/.github/ISSUE_TEMPLATE/migration.yml
+++ b/.github/ISSUE_TEMPLATE/migration.yml
@ -6,7 +6,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
      render: shell
      placeholder: transformers version, platform, python version, ...
    validations:
--- a/.github/scripts/assign_reviewers.py
+++ b/.github/scripts/assign_reviewers.py
@ -54,6 +54,21 @@ def get_file_owners(file_path, codeowners_lines):
            return owners  # Remember, can still be empty!
    return []  # Should never happen, but just in case

+def pr_author_is_in_hf(pr_author, codeowners_lines):
+    # Check if the PR author is in the codeowners file
+    for line in codeowners_lines:
+        line = line.split('#')[0].strip()
+        if not line:
+            continue
+
+        # Split into pattern and owners
+        parts = line.split()
+        owners = [owner.removeprefix("@") for owner in parts[1:]]
+
+        if pr_author in owners:
+            return True
+    return False
+
 def main():
    script_dir = Path(__file__).parent.absolute()
    with open(script_dir / "codeowners_for_review_action") as f:
@ -68,6 +83,9 @@ def main():
    pr_number = event['pull_request']['number']
    pr = repo.get_pull(pr_number)
    pr_author = pr.user.login
+    if pr_author_is_in_hf(pr_author, codeowners_lines):
+        print(f"PR author {pr_author} is in codeowners, skipping review request.")
+        return

    existing_reviews = list(pr.get_reviews())
    if existing_reviews:
--- a/.github/workflows/add-model-like.yml
+++ b/.github/workflows/add-model-like.yml
@ -54,7 +54,7 @@ jobs:
      - name: Create model files
        run: |
          . ~/venv/bin/activate
-          transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
+          transformers add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
          make style
          make fix-copies

--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -63,14 +63,14 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build 
+          title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

  latest-torch-deepspeed-docker:
    name: "Latest PyTorch + DeepSpeed"
    runs-on:
-      group: aws-general-8-plus
+      group: aws-g4dn-2xlarge-cache
    steps:
      -
        name: Set up Docker Buildx
@ -99,7 +99,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}}
-          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build 
+          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -140,7 +140,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build 
+          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -176,7 +176,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-doc-builder docker build 
+          title: 🤗 Results of the huggingface/transformers-doc-builder docker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -214,7 +214,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build 
+          title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -223,19 +223,19 @@ jobs:
    runs-on:
      group: aws-general-8-plus
    steps:
-      - 
+      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
-      - 
+      -
        name: Check out code
        uses: actions/checkout@v4
-      - 
+      -
        name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      - 
+      -
        name: Build and push
        uses: docker/build-push-action@v5
        with:
@ -263,7 +263,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build 
+          title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -301,7 +301,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build 
+          title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -310,19 +310,19 @@ jobs:
    runs-on:
      group: aws-general-8-plus
    steps:
-      - 
+      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
-      - 
+      -
        name: Check out code
        uses: actions/checkout@v4
-      - 
+      -
        name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      - 
+      -
        name: Build and push
        uses: docker/build-push-action@v5
        with:
@ -350,7 +350,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build 
+          title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -388,6 +388,6 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-quantization-latest-gpu build 
+          title: 🤗 Results of the transformers-quantization-latest-gpu build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/build-nightly-ci-docker-images.yml
+++ b/.github/workflows/build-nightly-ci-docker-images.yml
@ -42,7 +42,7 @@ jobs:
  nightly-torch-deepspeed-docker:
    name: "Nightly PyTorch + DeepSpeed"
    runs-on:
-      group: aws-general-8-plus
+      group: aws-g4dn-2xlarge-cache
    steps:
      -
        name: Set up Docker Buildx
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -14,4 +14,4 @@ jobs:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: transformers
-      languages: ar de en es fr hi it ko pt tr zh ja te
+      languages: en
--- a/.github/workflows/change_pr_to_draft.yml
+++ b/.github/workflows/change_pr_to_draft.yml
@ -1,25 +0,0 @@
-name: Change PR to draft
-
-on:
-  pull_request_target:
-    types: [opened, reopened]
-
-jobs:
-  convert_pr_to_draft:
-    runs-on: ubuntu-22.04
-    name: Convert PR to draft
-    permissions:
-      pull-requests: write
-      contents: write
-    if: github.event.pull_request.draft == false
-    steps:
-      - name: Convert PR to draft
-        shell: bash
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          REPO: ${{ github.repository }}
-        run: |
-          echo $PR_NUMBER
-          gh pr ready $PR_NUMBER --repo $REPO --undo
-          gh pr comment $PR_NUMBER --repo $REPO --body "Hi 👋, thank you for opening this pull request! The pull request is converted to draft by default. The CI will be paused while the PR is in draft mode. When it is ready for review, please click the \`Ready for review\` button (at the bottom of the PR page). This will assign reviewers and trigger CI."
--- a/.github/workflows/check_failed_model_tests.yml
+++ b/.github/workflows/check_failed_model_tests.yml
@ -29,7 +29,7 @@ jobs:
  run_models_gpu:
    name: " "
    runs-on:
-      group: aws-g4dn-2xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -28,7 +28,7 @@ jobs:
      matrix:
        split_keys: ${{ fromJson(inputs.split_keys) }}
    runs-on: 
-      group: aws-g4dn-2xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -15,7 +15,7 @@ jobs:
  setup:
    name: Setup
    runs-on: 
-      group: aws-g4dn-2xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -18,6 +18,10 @@ on:
      docker:
        required: true
        type: string
+      report_name_prefix:
+        required: false
+        default: run_models_gpu
+        type: string

 env:
  HF_HOME: /mnt/cache
@ -103,7 +107,7 @@ jobs:
        run: |
          echo "${{ inputs.machine_type }}"

-          if [ "${{ inputs.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -116,23 +120,23 @@ jobs:

      - name: Run all tests on GPU
        working-directory: /transformers
-        run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+        run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+        run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt

      - name: Run test
        shell: bash
        run: |
-          mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
-          echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
-          echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
+          mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
+          echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"

-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
--- a/.github/workflows/new_model_pr_merged_notification.yml
+++ b/.github/workflows/new_model_pr_merged_notification.yml
@ -59,7 +59,7 @@ jobs:
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
-                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh"
+                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh\ncommit SHA: ${{ env.COMMIT_SHA }}"
                  }
                }
              ]
--- a/.github/workflows/pr-style-bot.yml
+++ b/.github/workflows/pr-style-bot.yml
@ -0,0 +1,19 @@
+# To run this bot, comment "@bot /style" on a PR
+name: Style Bot
+
+on:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  style:
+    uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
+    with:
+      python_quality_dependencies: "[quality]"
+      style_command_type: "default"
+    secrets:
+      bot_token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
@ -145,7 +145,7 @@ jobs:
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MODELS: ${{ needs.get-tests.outputs.models }}
-          BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
+          BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
        run: |
          gh api \
            --method POST \
@ -185,7 +185,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
       group: '${{ matrix.machine_type }}'
    container:
@ -239,7 +239,7 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -292,7 +292,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -338,7 +338,7 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -54,12 +54,23 @@ jobs:
      ci_event: Daily CI
    secrets: inherit

+  trainer-fsdp-ci:
+    name: Trainer/FSDP CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_trainer_and_fsdp_gpu
+      slack_report_channel: "#transformers-ci-daily-training"
+      runner: daily-ci
+      docker: huggingface/transformers-all-latest-gpu
+      ci_event: Daily CI
+    secrets: inherit
+
  deepspeed-ci:
    name: DeepSpeed CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-deepspeed"
+      slack_report_channel: "#transformers-ci-daily-training"
      runner: daily-ci
      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
      ci_event: Daily CI
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -45,11 +45,11 @@ env:

 jobs:
  setup:
-    if: contains(fromJSON('["run_models_gpu", "run_quantization_torch_gpu"]'), inputs.job)
+    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -77,12 +77,17 @@ jobs:
        run: pip freeze

      - id: set-matrix
-        if: ${{ inputs.job == 'run_models_gpu' }}
+        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
-          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
-          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
+            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
+            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
+            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
+            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
+          fi

      - id: set-matrix-quantization
        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
@ -102,7 +107,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs.yml
    with:
@ -113,13 +118,32 @@ jobs:
      docker: ${{ inputs.docker }}
    secrets: inherit

+  run_trainer_and_fsdp_gpu:
+    if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
+    name: " "
+    needs: setup
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        slice_id: [0, 1]
+    uses: ./.github/workflows/model_jobs.yml
+    with:
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      machine_type: ${{ matrix.machine_type }}
+      slice_id: ${{ matrix.slice_id }}
+      runner: ${{ inputs.runner }}
+      docker: ${{ inputs.docker }}
+      report_name_prefix: run_trainer_and_fsdp_gpu
+    secrets: inherit
+
  run_pipelines_torch_gpu:
    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
    name: PyTorch pipelines
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -153,7 +177,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -187,7 +211,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -222,7 +246,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -256,7 +280,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -290,7 +314,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -325,7 +349,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -382,12 +406,12 @@ jobs:
        run: pip freeze

      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -424,7 +448,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -467,7 +491,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -541,6 +565,7 @@ jobs:
    needs: [
      setup,
      run_models_gpu,
+      run_trainer_and_fsdp_gpu,
      run_pipelines_torch_gpu,
      run_pipelines_tf_gpu,
      run_examples_gpu,
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@ -35,7 +35,7 @@ jobs:
        shell: bash
        run: |
          if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
-            echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV
+            echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
            echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -78,7 +78,7 @@ Once you've confirmed the bug hasn't already been reported, please include the f
 To get the OS and software versions automatically, run the following command:

 ```bash
-transformers-cli env
+transformers env
 ```

 You can also run the same command from the root of the repository:
--- a/ISSUES.md
+++ b/ISSUES.md
@ -26,7 +26,7 @@ There are two main venues to receive support: [the forums](https://discuss.huggi

 [The user forums](https://discuss.huggingface.co/) are supported by the wide community of the library users and backed up by developers when needed.

-If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystalized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).
+If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystallized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).

 In particular all "Please explain" questions or objectively very user-specific feature requests belong to the forums. Here are some example of such questions:

--- a/2
+++ b/2
@ -79,7 +79,7 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency

 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
-	python utils/check_modular_conversion.py  --fix_and_overwrite
+	python utils/check_modular_conversion.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
 	python utils/check_doctest_list.py --fix_and_overwrite
 	python utils/check_docstrings.py --fix_and_overwrite
--- a/README.md
+++ b/README.md
@ -70,7 +70,7 @@ Explore the [Hub](https://huggingface.com/) today to find a model and use Transf

 ## Installation

-Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.0+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+.
+Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.1+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+.

 Create and activate a virtual environment with [venv](https://docs.python.org/3/library/venv.html) or [uv](https://docs.astral.sh/uv/), a fast Rust-based Python package and project manager.

@ -78,7 +78,6 @@ Create and activate a virtual environment with [venv](https://docs.python.org/3/
 # venv
 python -m venv .my-env
 source .my-env/bin/activate
-
 # uv
 uv venv .my-env
 source .my-env/bin/activate
@ -88,10 +87,10 @@ Install Transformers in your virtual environment.

 ```py
 # pip
-pip install transformers
+pip install "transformers[torch]"

 # uv
-uv pip install transformers
+uv pip install "transformers[torch]"
 ```

 Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error.
@ -99,7 +98,12 @@ Install Transformers from source if you want the latest changes in the library o
 ```shell
 git clone https://github.com/huggingface/transformers.git
 cd transformers
-pip install .
+
+# pip
+pip install .[torch]
+
+# uv
+uv pip install .[torch]
 ```

 ## Quickstart
@ -121,7 +125,7 @@ To chat with a model, the usage pattern is the same. The only difference is you
 > [!TIP]
 > You can also chat with a model directly from the command line.
 > ```shell
-> transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
+> transformers chat Qwen/Qwen2.5-0.5B-Instruct
 > ```

 ```py
--- a/SECURITY.md
+++ b/SECURITY.md
@ -27,13 +27,6 @@ These models require the `trust_remote_code=True` parameter to be set when using
 the content of the modeling files when using this argument. We recommend setting a revision in order to ensure you
 protect yourself from updates on the repository.

-#### Tools
-
-Through the `Agent` framework, remote tools can be downloaded to be used by the Agent. You're to specify these tools
-yourself, but please keep in mind that their code will be run on your machine if the Agent chooses to run them.
-
-Please inspect the code of the tools before passing them to the Agent to protect your runtime and local setup.
-
 ## Reporting a Vulnerability

 Feel free to submit vulnerability reports to [security@huggingface.co](mailto:security@huggingface.co), where someone from the HF security team will review and recommend next steps. If reporting a vulnerability specific to open source, please note [Huntr](https://huntr.com) is a vulnerability disclosure program for open source software.
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@ -90,7 +90,7 @@ def summarize(run_dir, metrics, expand_metrics=False):

        model = benchmark.config.backend["model"]

-        # Ths looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
+        # This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
        # (we rely on the usage of hydra's `${hydra.job.override_dirname}`.)
        benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
        benchmark_name = str(Path(benchmark_name).parts[-1])
--- a/benchmark/llama.py
+++ b/benchmark/llama.py
@ -293,7 +293,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
                max_cache_len=seq_length + 128,
            )

-            # 3nd call
+            # 3rd call
            start = perf_counter()
            output = model.generate(**inputs, past_key_values=past_key_values)
            end = perf_counter()
--- a/conftest.py
+++ b/conftest.py
@ -66,7 +66,6 @@ NOT_DEVICE_TESTS = {
    "ModelTester::test_pipeline_",
    "/repo_utils/",
    "/utils/",
-    "/agents/",
 }

 # allow having multiple repository checkouts and not needing to remember to rerun
@ -83,7 +82,6 @@ def pytest_configure(config):
    config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
    config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
    config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
-    config.addinivalue_line("markers", "agent_tests: mark the agent tests that are run on their specific schedule")
    config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")


--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -5,7 +5,7 @@ ARG REF=main
 RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
-RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 # tensorflow pin matching setup.py
 RUN uv pip install --no-cache-dir pypi-kenlm
 RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
--- a/docker/custom-tokenizers.dockerfile
+++ b/docker/custom-tokenizers.dockerfile
@ -16,7 +16,7 @@ RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
 RUN make install -j 10


-RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache --upgrade 'torch==2.6.0' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used so not tested. Causes to failures. TODO fix later
--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
 RUN uv pip uninstall transformers
--- a/docker/exotic-models.dockerfile
+++ b/docker/exotic-models.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps timm accelerate
 RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
 # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
+RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
 RUN uv pip uninstall transformers
--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
 RUN uv pip uninstall transformers
--- a/docker/torch-tf-light.dockerfile
+++ b/docker/torch-tf-light.dockerfile
@ -7,7 +7,7 @@ RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-de
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
-RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN git lfs install

 RUN uv pip install --no-cache-dir pypi-kenlm
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -14,6 +14,8 @@ ARG PYTORCH='2.6.0'
 ARG INTEL_TORCH_EXT='2.3.0'
 # Example: `cu102`, `cu113`, etc.
 ARG CUDA='cu121'
+# Disable kernel mapping for now until all tests pass
+ENV DISABLE_KERNEL_MAPPING=1

 RUN apt update
 RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
@ -69,6 +71,9 @@ RUN python3 -m pip install --no-cache-dir g2p-en
 # For Some bitsandbytes tests
 RUN python3 -m pip install --no-cache-dir einops

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -1,12 +1,12 @@
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
-FROM nvcr.io/nvidia/pytorch:23.11-py3
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
+FROM nvcr.io/nvidia/pytorch:24.08-py3
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

-ARG PYTORCH='2.2.0'
+ARG PYTORCH='2.6.0'
 # Example: `cu102`, `cu113`, etc.
-ARG CUDA='cu121'
+ARG CUDA='cu126'

 RUN apt -y update
 RUN apt install -y libaio-dev
@ -15,7 +15,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
 ARG REF=main
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

-RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
+# `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
+RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'

 # Install latest release PyTorch
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
@ -44,6 +45,9 @@ RUN python3 -m pip uninstall -y deepspeed
 # TODO: Find out why test fail.
 RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
@ -1,11 +1,11 @@
 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
-FROM nvcr.io/nvidia/pytorch:23.11-py3
+FROM nvcr.io/nvidia/pytorch:24.08-py3
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

 # Example: `cu102`, `cu113`, etc.
-ARG CUDA='cu121'
+ARG CUDA='cu126'

 RUN apt -y update
 RUN apt install -y libaio-dev
@ -21,7 +21,8 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
 RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA

-RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
+# `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
+RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

@ -56,6 +57,9 @@ RUN python3 -m pip uninstall -y deepspeed
 #RUN git clone https://github.com/pytorch/TensorRT.git
 #RUN cd TensorRT/py && python3 setup.py install --fx-only

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@ -28,6 +28,9 @@ RUN python3 -m pip uninstall -y tensorflow flax
 RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -12,6 +12,8 @@ SHELL ["sh", "-lc"]
 ARG PYTORCH='2.6.0'
 # Example: `cu102`, `cu113`, etc.
 ARG CUDA='cu121'
+# Disable kernel mapping for quantization tests
+ENV DISABLE_KERNEL_MAPPING=1

 RUN apt update
 RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
@ -82,9 +84,15 @@ RUN python3 -m pip install --no-cache-dir compressed-tensors
 # Add AMD Quark for quantization testing
 RUN python3 -m pip install --no-cache-dir amd-quark

+# Add AutoRound for quantization testing
+RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
+
 # Add transformers in editable mode
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docs/source/ar/_toctree.yml
+++ b/docs/source/ar/_toctree.yml
@ -23,8 +23,6 @@
    title: تحميل النماذج المخصصة وتدريبها باستخدام 🤗 PEFT
  - local: model_sharing
    title: مشاركة نموذجك
-  - local: agents
-    title: الوكلاء
  - local: llm_tutorial
    title: التوليد باستخدام LLMs
  - local: conversations
@ -252,8 +250,6 @@
  title: أطر مفاهيمية
 # - sections:
 #   - sections:
-#     - local: main_classes/agent
-#       title: الوكلاء والأدوات
 #     - local: model_doc/auto
 #       title: فئات يتم إنشاؤها ديناميكيًا
 #     - local: main_classes/backbones
--- a/docs/source/ar/agents.md
+++ b/docs/source/ar/agents.md
@ -1,539 +0,0 @@
-# الوكلاء والأدوات
-
-[[open-in-colab]]
-
-### ما هو الوكيل؟
-
-يمكن للنظم اللغوية الكبيرة (LLMs) التي تم تدريبها على أداء [نمذجة اللغة السببية](./tasks/language_modeling.) التعامل مع مجموعة واسعة من المهام، ولكنها غالبًا ما تواجه صعوبات في المهام الأساسية مثل المنطق والحساب والبحث. وعندما يتم استدعاؤها في مجالات لا تؤدي فيها أداءً جيدًا، فإنها غالبًا ما تفشل في توليد الإجابة التي نتوقعها منها.
-
-يتمثل أحد النهج للتغلب على هذا القصور في إنشاء "وكيل".
-
-الوكيل هو نظام يستخدم LLM كمحرك له، ولديه حق الوصول إلى وظائف تسمى "أدوات".
-
-هذه "الأدوات" هي وظائف لأداء مهمة، وتحتوي على جميع الأوصاف اللازمة للوكيل لاستخدامها بشكل صحيح.
-
-يمكن برمجة الوكيل للقيام بما يلي:
- وضع سلسلة من الإجراءات/الأدوات وتشغيلها جميعًا في نفس الوقت مثل [`CodeAgent`] على سبيل المثال
- التخطيط للاجراءات/الأدوات وتنفيذها واحدة تلو الأخرى والانتظار حتى انتهاء كل إجراء قبل إطلاق التالي مثل [`ReactJsonAgent`] على سبيل المثال
-
-### أنواع الوكلاء
-
-#### الوكيل البرمجي (Code agent)
-
-يتمتع هذا الوكيل يتبع خطوات محددة: أولًا، يخطط لسلسلة من الإجراءات التي يريد تنفيذها، ثم شفرة Python لتنفيذ جميع الإجراءات في نفس الوقت. وهو يتعامل بشكل أصلي مع أنواع مختلفة من المدخلات والمخرجات للأدوات التي يستخدمها، وبالتالي فهو الخيار الموصى به للمهام متعددة الوسائط.
-
-#### وكلاء التفاعل
-
-هذا هو الوكيل الذي يتم اللجوء إليه لحل مهام الاستدلال، حيث يجعل إطار ReAct ([Yao et al.، 2022](https://huggingface.co/papers/2210.03629)) من الكفاءة حقًا التفكير على أساس ملاحظاته السابقة.
-
-نقوم بتنفيذ إصدارين من ReactJsonAgent: 
- [`ReactJsonAgent`] يقوم بتوليد استدعاءات الأدوات كـ JSON في إخراجها.
- [`ReactCodeAgent`] هو نوع جديد من ReactJsonAgent يقوم بتوليد استدعاءات أدواته كمقاطع من التعليمات البرمجية، والتي تعمل بشكل جيد حقًا مع LLMs التي تتمتع بأداء  قوي في البرمجة.
-
-> [!TIP]
-> اقرأ منشور المدونة [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) لمعرفة المزيد عن وكيل ReAct.
-
-![إطار عمل وكيل ReAct](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png)
-
-على سبيل المثال، إليك كيف يعمل وكيل ReAct Code طريقه من خلال السؤال التالي.
-
-```py3
->>> agent.run(
-...     "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
-... )
-=====New task=====
-How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
-====Agent is executing the code below:
-bert_blocks = search(query="number of blocks in BERT base encoder")
-print("BERT blocks:", bert_blocks)
-====
-Print outputs:
-BERT blocks: twelve encoder blocks
-
-====Agent is executing the code below:
-attention_layer = search(query="number of layers in Attention is All You Need")
-print("Attention layers:", attention_layer)
-====
-Print outputs:
-Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture.
-
-====Agent is executing the code below:
-bert_blocks = 12
-attention_layers = 6
-diff = bert_blocks - attention_layers
-print("Difference in blocks:", diff)
-final_answer(diff)
-====
-
-Print outputs:
-Difference in blocks: 6
-
-Final answer: 6
-```
-
-### كيف يمكنني بناء وكيل؟
-
-لتهيئة وكيل، تحتاج إلى هذه الوسائط:
-
- نموذج لغوي كبير (LLM) يشكل المحرك الأساسي للوكيل. الوكيل نفسه ليس النموذج اللغوي، بل هو برنامج يستخدم النموذج اللغوي كمحرك له.
- موجه النظام (system prompt): هذه هي التعليمات التي يتم إعطاؤها للنموذج اللغوي لإنشاء مخرجاته.
- صندوق أدوات (toolbox) يختار الوكيل منه الأدوات لتنفيذها
- محلل (parser) لاستخراج الأدوات التي يجب استدعاؤها من مخرجات النموذج اللغوي LLM والأدوات التي يجب استخدامها
-
-عند تهيئة نظام الوكيل، يتم استخدام سمات الأداة لإنشاء وصف للأداة، ثم يتم دمجها في موجه النظام الخاص `system_prompt` للوكيل لإعلامه بالأدوات التي يمكنه استخدامها ولماذا.
-
-للبدء، يرجى تثبيت `agents` الإضافية لتثبيت جميع التبعيات الافتراضية.
-
-```bash
-pip install transformers[agents]
-```
-
-قم ببناء محرك LLM الخاص بك من خلال تعريف طريقة `llm_engine` التي تقبل قائمة من [الرسائل](./chat_templating.) وتعيد النص. يجب أن تقبل هذه الدالة القابلة للاستدعاء أيضًا معامل `stop` يشير إلى متى يجب التوقف عن التوليد.
-
-```python
-from huggingface_hub import login, InferenceClient
-
-login("<YOUR_HUGGINGFACEHUB_API_TOKEN>")
-
-client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
-
-def llm_engine(messages, stop_sequences=["Task"]) -> str:
-    response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
-    answer = response.choices[0].message.content
-    return answer
-```
-
-يمكنك استخدام أي طريقة `llm_engine` طالما أنها:
-1. يتبع تنسيق [رسائل](./chat_templating.md) لإدخاله (`List [Dict [str، str]]`) ويعيد `str`
-2. يتوقف عن توليد المخراجات من التسلسلات التي تم تمريرها في معامل `stop`
-
-أنت بحاجة أيضًا إلى معامل "الأدوات" الذي يقبل قائمة من "الأدوات". يمكنك توفير قائمة فارغة لـ "الأدوات"، ولكن استخدم صندوق الأدوات الافتراضي مع معامل اختياري `add_base_tools=True`.
-
-الآن يمكنك إنشاء وكيل، مثل [`CodeAgent`], وتشغيله. ولتسهيل الأمر، نقدم أيضًا فئة [`HfEngine`] التي تستخدم `huggingface_hub.InferenceClient` بشكل مخفى.
-
-```python
-from transformers import CodeAgent, HfEngine
-
-llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
-agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-
-agent.run(
-    "Could you translate this sentence from French, say it out loud and return the audio.",
-    sentence="Où est la boulangerie la plus proche?",
-)
-```
-
-هذه الميزة ستكون مفيدة في حالة الحاجة الملحة! يمكنك حتى ترك معامل `llm_engine` غير محدد، وسيتم إنشاء [`HfEngine`] بشكل تلقائي.
-
-```python
-from transformers import CodeAgent
-
-agent = CodeAgent(tools=[], add_base_tools=True)
-
-agent.run(
-    "Could you translate this sentence from French, say it out loud and give me the audio.",
-    sentence="Où est la boulangerie la plus proche?",
-)
-```
-
-لاحظ أننا استخدمنا معامل "sentence" إضافي: يمكنك تمرير النص كمعامل إضافي إلى النموذج.
-
-يمكنك أيضًا استخدام هذا للإشارة إلى مسار الملفات المحلية أو البعيدة للنموذج لاستخدامها:
-
-```py
-from transformers import ReactCodeAgent
-
-agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-
-agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
-```
-
-
-تم تحديد موجه النظام ومحلل المخرجات تلقائيًا، ولكن يمكنك فحصهما بسهولة عن طريق استدعاء `system_prompt_template` على وكيلك.
-
-```python
-print(agent.system_prompt_template)
-```
-
-من المهم أن تشرح بأكبر قدر ممكن من الوضوح المهمة التي تريد تنفيذها.
-كل عملية [`~Agent.run`] مستقلة، وبما أن الوكيل مدعوم من LLM، فقد تؤدي الاختلافات الطفيفة في موجهك إلى نتائج مختلفة تمامًا.
-يمكنك أيضًا تشغيل وكيل بشكل متتالي لمهام مختلفة: في كل مرة يتم فيها إعادة تهيئة سمتي `agent.task` و`agent.logs`.
-
-
-#### تنفيذ التعليمات البرمجية
-
-يقوم مفسر Python بتنفيذ التعليمات البرمجية على مجموعة من المدخلات التي يتم تمريرها جنبًا إلى جنب مع أدواتك.
-يجب أن يكون هذا الأمر آمنًا لأن الوظائف الوحيدة التي يمكن استدعاؤها هي الأدوات التي قدمتها (خاصة إذا كانت أدوات من Hugging Face فقط) ووظيفة الطباعة، لذا فأنت مقيد بالفعل بما يمكن تنفيذه.
-
-مفسر Python لا يسمح أيضًا باستدعاء دوال بشكل افتراضي خارج قائمة آمنة، لذا فإن جميع الهجمات الأكثر وضوحًا لا ينبغي أن تكون مشكلة.
-يمكنك أيضًا الإذن باستيرادات إضافية عن طريق تمرير الوحدات النمطية المصرح بها كقائمة من السلاسل في معامل  `additional_authorized_imports` عند تهيئة [`ReactCodeAgent`] أو [`CodeAgent`]:
-
-```py
->>> from transformers import ReactCodeAgent
-
->>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
->>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
-
-(...)
-'Hugging Face – Blog'
-```
-
-سيتم إيقاف التنفيذ عند أي رمز يحاول تنفيذ عملية غير قانونية أو إذا كان هناك خطأ Python عادي في التعليمات البرمجية التي تم إنشاؤها بواسطة الوكيل.
-
-> [!WARNING]
-> يمكن لـ LLM توليد شفرة برمجية عشوائية سيتم تنفيذها بعد ذلك: لا تقمب استدعاء أى دوال غير آمنة!
-
-### موجه النظام
-
-ينشئ الوكيل، أو بالأحرى LLM الذي يقود الوكيل، يولد مخرجات بناءً على موجه النظام. يمكن تخصيص موجه النظام وتصميمه للمهام المقصودة. على سبيل المثال، تحقق من موجه النظام لـ [`ReactCodeAgent`] (الإصدار أدناه مبسط قليلاً).
-
-```text
-You will be given a task to solve as best you can.
-You have access to the following tools:
-<<tool_descriptions>>
-
-To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
-
-At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
-Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence.
-During each intermediate step, you can use 'print()' to save whatever important information you will then need.
-These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
-
-In the end you have to return a final answer using the `final_answer` tool.
-
-Here are a few examples using notional tools:
---
-{examples}
-
-Above example were using notional tools that might not exist for you. You only have access to those tools:
-<<tool_names>>
-You also can perform computations in the python code you generate.
-
-Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward.
-
-Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
-Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
-
-Remember to make sure that variables you use are all defined.
-
-Now Begin!
-```
-
-يتضمن موجه النظام:
- *مقدمة* تشرح كيف يجب أن يتصرف الوكيل والأدوات التي يجب عليه استخدامها.
- وصف لجميع الأدوات التي يتم تحديدها بواسطة رمز `<<tool_descriptions>>` الذي يتم استبداله ديناميكيًا في وقت التشغيل بالأدوات التي يحددها المستخدم أو يختارها.
-    - يأتي وصف الأداة من سمات الأداة، `name`، و`description`، و`inputs` و`output_type`، وقالب `jinja2` بسيط يمكنك تحسينه.
- شكل المخرج المتوقع.
-
-يمكنك تحسين موجه النظام، على سبيل المثال، عن طريق إضافة شرح لتنسيق المخرجات.
-
-للحصول على أقصى قدر من المرونة، يمكنك الكتابة فوق قالب موجه النظام بالكامل عن طريق تمرير موجه مخصص كمعامل إلى معلمة `system_prompt`.
-
-```python
-from transformers import ReactJsonAgent
-from transformers.agents import PythonInterpreterTool
-
-agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
-```
-
-> [!WARNING]
-> يرجى التأكد من تحديد سلسلة `<<tool_descriptions>>` في مكان ما في `template` حتى يكون الوكيل على علم 
-بالأدوات المتاحة.
-
-
-### فحص تشغيل الوكيل
-
-فيما يلي بعض السمات المفيدة لفحص ما حدث بعد التشغيل:
- تخزن  `agent.logs` سجلات مفصلة للوكيل. في كل خطوة من تشغيل الوكيل، يتم تخزين كل شيء في قاموس إلحاقه بـ `agent.logs`.
- تشغيل `agent.write_inner_memory_from_logs()` يخلق ذاكرة داخلية لسجلات الوكيل للنظام LLM لعرضها، كقائمة من رسائل الدردشة. تنتقل هذه الطريقة عبر كل خطوة من سجل الوكيل ولا تخزن سوى ما يهمها كرسالة: على سبيل المثال، سيحفظ موجه النظام والمهمة في رسائل منفصلة، ثم لكل خطوة سيخزن مخرج LLM كرسالة، ومخرج استدعاء الأداة كرسالة أخرى. استخدم هذا إذا كنت تريد عرضًا عامًا لما حدث - ولكن لن يتم نسخ كل سجل بواسطة هذه الطريقة.
-
-## الأدوات
-
-الأداة هي عبارة عن وظيفة أساسية يستخدمها الوكيل لتنفيذ مهمة محددة.
-
-يمكنك على سبيل المثال التحقق من [`PythonInterpreterTool`]: لديه اسم ووصف ووصف للمدخلات ونوع للمخرج، وطريقة `__call__` التي تقوم بتنفيذ المهمة المطلوبة.
-
-عند تهيئة الوكيل، يتم استخدام سمات الأداة لتوليد وصف للأداة يتم تضمينه في موجه النظام الخاص بالوكيل. يتيح هذا للوكيل معرفة الأدوات التي يمكنه استخدامها ولماذا.
-
-### صندوق الأدوات الافتراضي
-
-يأتي Transformers مع صندوق أدوات افتراضي لتمكين الوكلاء، والذي يمكنك إضافته إلى وكيلك عند التهيئة باستخدام معامل `add_base_tools = True`:
-
- **الإجابة على أسئلة المستند**: الإجابة على سؤال حول المستند (مثل ملف PDF) بتنسيق صورة ([Donut](./model_doc/donut))
- **الإجابة على أسئلة الصور**: الإجابة على سؤال حول صورة ([VILT](./model_doc/vilt))
- **التحدث إلى النص**: قم بتفريغ الكلام إلى نص ([Whisper](./model_doc/whisper))
- **النص إلى كلام**: تحويل النص إلى كلام ([SpeechT5](./model_doc/speecht5))
- **الترجمة**: ترجمة جملة معينة من لغة المصدر إلى لغة الهدف.
- **مفسر كود Python**: تشغيل كود Python الذي تم إنشاؤه بواسطة LLM في بيئة آمنة. لن يتم إضافة هذه الأداة إلى [`ReactJsonAgent`] إلا إذا استخدمت `add_base_tools=True`، نظرًا لأن الأدوات المستندة إلى التعليمات البرمجية يمكنها بالفعل تنفيذ كود Python
-لا تترجم النصوص الخاصة ولا الأكواد البرمجية ولا الروابط ولا رموز HTML وCSS:
-
-يمكنك استخدام أداة يدويًا عن طريق استدعاء دالة [`load_tool`] وتحديد مهمة لتنفيذها.
-
-```python
-from transformers import load_tool
-
-tool = load_tool("text-to-speech")
-audio = tool("This is a text to speech tool")
-```
-
-### إنشاء أداة جديدة
-
-يمكنك إنشاء أداتك الخاصة لتغطية حالات الاستخدام التي لا تغطيها الأدوات الافتراضية من Hugging Face.
-على سبيل المثال، دعنا نقوم بإنشاء أداة تعرض النموذج الأكثر تنزيلًا لمهمة معينة من Hub.
-
-سوف نبدأ بالكود التالي.
-
-```python
-from huggingface_hub import list_models
-
-task = "text-classification"
-
-model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-print(model.id)
-```
-
-يمكن تحويل هذه الشيفرة إلى فئة ترث من الفئة العليا [`Tool`].
-
-تحتاج الأداة المخصصة إلى:
-
- اسم `name`، والتي تمثل اسم الأداة نفسها. عادةً ما يصف الاسم وظيفتها. بما أن الكود يعيد النموذج الأكثر تنزيلًا لمهمة ما، فلنسمها `model_download_counter`.
- تستخدم خاصية `description` لملء موجه نظام الوكيل.
- خاصية `inputs`، والتي هي عبارة عن قاموس بمفاتيح "type" و"description". يحتوي على معلومات تساعد المفسر Python على اتخاذ خيارات مستنيرة بشأن المدخلات.
- خاصية `output_type`، والتي تحدد نوع المخرج.
- طريقة `forward` والتي تحتوي على الكود الذي سيتم تنفيذه للحصول على النتيجة النهائية.
-
-```python
-from transformers import Tool
-from huggingface_hub import list_models
-
-class HFModelDownloadsTool(Tool):
-    name = "model_download_counter"
-    description = (
-        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
-        "It returns the name of the checkpoint."
-    )
-
-    inputs = {
-        "task": {
-            "type": "text",
-            "description": "the task category (such as text-classification, depth-estimation, etc)",
-        }
-    }
-    output_type = "text"
-
-    def forward(self, task: str):
-        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-        return model.id
-```
-
-الآن بعد أن أصبحت فئة `HfModelDownloadsTool` المخصصة جاهزة، يمكنك حفظها في ملف باسم `model_downloads.py` واستيرادها للاستخدام.
-
-```python
-from model_downloads import HFModelDownloadsTool
-
-tool = HFModelDownloadsTool()
-```
-
-يمكنك أيضًا مشاركة أداتك المخصصة في Hub عن طريق استدعاء [`~Tool.push_to_hub`] على الأداة. تأكد من أنك قمت بإنشاء مستودع لها على Hub وأنك تستخدم رمز وصول للقراءة.
-
-```python
-tool.push_to_hub("{your_username}/hf-model-downloads")
-```
-
-قم بتحميل الأداة باستخدام دالة [`~Tool.load_tool`] ومررها إلى معلمة `tools` في الوكيل الخاص بك.
-
-```python
-from transformers import load_tool, CodeAgent
-
-model_download_tool = load_tool("m-ric/hf-model-downloads")
-agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
-agent.run(
-    "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
-)
-```
-
-ستحصل على ما يلي:
-
-```text
-======== New task ========
-Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
-==== Agent is executing the code below:
-most_downloaded_model = model_download_counter(task="text-to-video")
-print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
-====
-```
-
-والناتج:
-
-`"النموذج الأكثر تنزيلًا لمهمة `text-to-video` هو ByteDance/AnimateDiff-Lightning."`
-
-### إدارة صندوق أدوات الوكيل الخاص بك
-
-إذا كنت قد قمت بتهيئة وكيل، فمن غير الملائم إعادة تهيئته من البداية لإضافة أداة جديدة ترغب في استخدامها. باستخدام مكتبة Transformers، يمكنك إدارة صندوق أدوات الوكيل بإضافة أو استبدال أداة موجودة.
-
-دعنا نضيف الأداة `model_download_tool` إلى وكيل تم تهيئته مسبقًا باستخدام صندوق الأدوات الافتراضي.
-
-```python
-from transformers import CodeAgent
-
-agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-agent.toolbox.add_tool(model_download_tool)
-```
-
-الآن يمكننا الاستفادة من الأداة الجديدة وأداة تحويل النص إلى كلام السابقة:
-
-```python
-    agent.run(
-        "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?"
-    )
-```
-
-| **Audio**                                                                                                                                            |
-|------------------------------------------------------------------------------------------------------------------------------------------------------|
-| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
-
-> [!WARNING]
-> احترس عند إضافة أدوات إلى وكيل يعمل بالفعل لأنه يمكن أن يؤثر على اختيار الأداة لصالح أداتك أو اختيار أداة أخرى غير المحددة بالفعل.
-
-استخدم طريقة `agent.toolbox.update_tool()` لاستبدال أداة موجودة في صندوق أدوات الوكيل.
-هذا مفيد إذا كانت أداتك الجديدة بديلاً مباشرًا للأداة الموجودة لأن الوكيل يعرف بالفعل كيفية تنفيذ تلك المهمة المحددة.
-تأكد فقط من اتباع الأداة الجديدة لنفس واجهة برمجة التطبيقات (API) للأداة المستبدلة أو قم بتكييف قالب موجه النظام لضمان تحديث جميع الأمثلة التي تستخدم الأداة المستبدلة.
-
-### استخدام مجموعة من الأدوات
-
-يمكنك الاستفادة من مجموعات الأدوات باستخدام كائن ToolCollection، مع تحديد مجموعة الأدوات التي تريد استخدامها.
-ثم قم بتمريرها كقائمة لتهيئة الوكيل الخاص بك، وبدء استخدامها!
-
-```py
-from transformers import ToolCollection, ReactCodeAgent
-
-image_tool_collection = ToolCollection(collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f")
-agent = ReactCodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True)
-
-agent.run("Please draw me a picture of rivers and lakes.")
-```
-
-لتسريع البداية، يتم تحميل الأدوات فقط إذا استدعاها الوكيل.
-
-ستحصل على هذه الصورة:
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" />
-
-### استخدام gradio-tools
-
-[gradio-tools](https://github.com/freddyaboulton/gradio-tools) هي مكتبة قوية تتيح استخدام Hugging
-Face Spaces كأدوات. تدعم العديد من المساحات الموجودة بالإضافة إلى مساحات مخصصة.
-
-تدعم مكتبة Transformers `gradio_tools` باستخدام طريقة [`Tool.from_gradio`] في الفئة. على سبيل المثال، دعنا نستخدم [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) من مجموعة أدوات `gradio-tools` لتحسين المطالبات لإنشاء صور أفضل.
-
-استورد وقم بتهيئة الأداة، ثم مررها إلى طريقة `Tool.from_gradio`:
-
-```python
-from gradio_tools import StableDiffusionPromptGeneratorTool
-from transformers import Tool, load_tool, CodeAgent
-
-gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
-prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
-```
-
-الآن يمكنك استخدامه مثل أي أداة أخرى. على سبيل المثال، دعنا نحسن الموجه `a rabbit wearing a space suit`.
-
-```python
-image_generation_tool = load_tool('huggingface-tools/text-to-image')
-agent = CodeAgent(tools=[prompt_generator_tool, image_generation_tool], llm_engine=llm_engine)
-
-agent.run(
-    "Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit'
-)
-```
-
-يستفيد النموذج بشكل كافٍ من الأداة:
-
-```text
-======== New task ========
-Improve this prompt, then generate an image of it.
-You have been provided with these initial arguments: {'prompt': 'A rabbit wearing a space suit'}.
-==== Agent is executing the code below:
-improved_prompt = StableDiffusionPromptGenerator(query=prompt)
-while improved_prompt == "QUEUE_FULL":
-    improved_prompt = StableDiffusionPromptGenerator(query=prompt)
-print(f"The improved prompt is {improved_prompt}.")
-image = image_generator(prompt=improved_prompt)
-====
-```
-
-قبل إنشاء الصورة أخيرًا:
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit_spacesuit_flux.webp" />
-
-> [!WARNING]
-> تتطلب gradio-tools إدخالات وإخراجات *نصية* حتى عند العمل مع طرائق مختلفة مثل كائنات الصور والصوت. الإدخالات والإخراجات الصورية والصوتية غير متوافقة حاليًا.
-
-### استخدام أدوات LangChain
-
-نحن نحب Langchain ونعتقد أنها تحتوي على مجموعة أدوات قوية للغاية.
-لاستيراد أداة من LangChain، استخدم الطريقة `from_langchain()`.
-
-فيما يلي كيفية استخدامها لإعادة إنشاء نتيجة البحث في المقدمة باستخدام أداة بحث الويب LangChain.
-
-```python
-from langchain.agents import load_tools
-from transformers import Tool, ReactCodeAgent
-
-search_tool = Tool.from_langchain(load_tools(["serpapi"])[0])
-
-agent = ReactCodeAgent(tools=[search_tool])
-
-agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?")
-```
-
-## واجهة Gradio
-
-يمكنك الاستفادة من `gradio.Chatbot` لعرض أفكار الوكيل الخاص بك باستخدام `stream_to_gradio`، إليك مثال:
-
-```py
-import gradio as gr
-from transformers import (
-    load_tool,
-    ReactCodeAgent,
-    HfEngine,
-    stream_to_gradio,
-)
-
-# Import tool from Hub
-image_generation_tool = load_tool("m-ric/text-to-image")
-
-llm_engine = HfEngine("meta-llama/Meta-Llama-3-70B-Instruct")
-
-# Initialize the agent with the image generation tool
-agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
-
-
-def interact_with_agent(task):
-    messages = []
-    messages.append(gr.ChatMessage(role="user", content=task))
-    yield messages
-    for msg in stream_to_gradio(agent, task):
-        messages.append(msg)
-        yield messages + [
-            gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
-        ]
-    yield messages
-
-
-with gr.Blocks() as demo:
-    text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
-    submit = gr.Button("Run illustrator agent!")
-    chatbot = gr.Chatbot(
-        label="Agent",
-        type="messages",
-        avatar_images=(
-            None,
-            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
-        ),
-    )
-    submit.click(interact_with_agent, [text_input], [chatbot])
-
-if __name__ == "__main__":
-    demo.launch()
-```
--- a/docs/source/ar/gguf.md
+++ b/docs/source/ar/gguf.md
@ -77,7 +77,7 @@ model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)

 الآن لديك إمكانية الوصول إلى النسخة الكامل غير المكممة للنموذج في بيئة PyTorch، حيث يمكنك دمجه مع مجموعة كبيرة من الأدوات الأخرى.

-لإعادة التحويل إلى ملف `gguf`، نوصي باستخدام ملف [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) من llama.cpp.
+لإعادة التحويل إلى ملف `gguf`، نوصي باستخدام ملف [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py) من llama.cpp.

 فيما يلي كيفية إكمال البرنامج النصي أعلاه لحفظ النموذج وإعادة تصديره مرة أخرى إلى `gguf`:

--- a/docs/source/ar/trainer.md
+++ b/docs/source/ar/trainer.md
@ -674,29 +674,7 @@ use_cpu: false
 ```

 </hfoption>
-<hfoption id="Tensor Parallelism with PyTorch 2">

-```yml
-compute_environment: LOCAL_MACHINE
-tp_config:
-  tp_size: 4
-distributed_type: TP
-downcast_bf16: 'no'
-machine_rank: 0
-main_training_function: main
-mixed_precision: 'no'
-num_machines: 1
-num_processes: 4
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
-
-```
-
-</hfoption>
 </hfoptions>
 يُعد أمر  [`accelerate_launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) هو الطريقة المُوصى بها لتشغيل نص البرمجى للتدريب على نظام موزع باستخدام Accelerate و [`Trainer`] مع المعلمات المحددة في `config_file.yaml`. يتم حفظ هذا الملف في مجلد ذاكرة التخزين المؤقت لـ Accelerate ويتم تحميله تلقائيًا عند تشغيل `accelerate_launch`.

--- a/docs/source/de/_toctree.yml
+++ b/docs/source/de/_toctree.yml
@ -23,8 +23,6 @@
    title: Laden und Trainieren von Adaptern mit 🤗 PEFT
  - local: model_sharing
    title: Ein Modell teilen
-  - local: transformers_agents
-    title: Agents
  - local: llm_tutorial
    title: Generation with LLMs
  title: Tutorials
@ -39,4 +37,4 @@
    title: Testen
  - local: pr_checks
    title: Überprüfung einer Pull Request
-  title: Contribute
+  title: Contribute
--- a/docs/source/de/add_new_model.md
+++ b/docs/source/de/add_new_model.md
@ -95,7 +95,7 @@ wie der Code geschrieben werden sollte :-)
 1. Der Vorwärtsdurchlauf Ihres Modells sollte vollständig in die Modellierungsdatei geschrieben werden und dabei völlig unabhängig von anderen
   Modellen in der Bibliothek. Wenn Sie einen Block aus einem anderen Modell wiederverwenden möchten, kopieren Sie den Code und fügen ihn mit einem
   `# Kopiert von` ein (siehe [hier](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
-   für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from). 
+   für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from).
 2. Der Code sollte vollständig verständlich sein, auch für einen Nicht-Muttersprachler. Das heißt, Sie sollten
   beschreibende Variablennamen wählen und Abkürzungen vermeiden. Ein Beispiel: `activation` ist `act` vorzuziehen.
   Von Variablennamen mit nur einem Buchstaben wird dringend abgeraten, es sei denn, es handelt sich um einen Index in einer for-Schleife.
@ -402,7 +402,7 @@ Andernfalls beginnen wir mit der Erstellung eines neuen Modells. Wir empfehlen d
 ein bestehendes Modell:

 ```bash
-transformers-cli add-new-model-like
+transformers add-new-model-like
 ```

 Sie werden mit einem Fragebogen aufgefordert, die grundlegenden Informationen Ihres Modells einzugeben.
--- a/docs/source/de/contributing.md
+++ b/docs/source/de/contributing.md
@ -63,7 +63,7 @@ Wenn Sie sich vergewissert haben, dass der Fehler noch nicht gemeldet wurde, geb
 Um das Betriebssystem und die Softwareversionen automatisch auszugeben, führen Sie den folgenden Befehl aus:

 ```bash
-transformers-cli env
+transformers env
 ```

 Sie können denselben Befehl auch im Hauptverzeichnis des Repositorys ausführen:
--- a/docs/source/de/transformers_agents.md
+++ b/docs/source/de/transformers_agents.md
@ -1,323 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Transformers Agents
-
-<Tip warning={true}>
-
-Transformers Agents ist eine experimentelle API, die jederzeit geändert werden kann. Die von den Agenten zurückgegebenen Ergebnisse
-zurückgegeben werden, können variieren, da sich die APIs oder die zugrunde liegenden Modelle ändern können.
-
-</Tip>
-
-Transformers Version v4.29.0, die auf dem Konzept von *Tools* und *Agenten* aufbaut. Sie können damit spielen in
-[dieses Colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj).
-
-Kurz gesagt, es bietet eine API für natürliche Sprache auf der Grundlage von Transformers: Wir definieren eine Reihe von kuratierten Tools und entwerfen einen 
-Agenten, um natürliche Sprache zu interpretieren und diese Werkzeuge zu verwenden. Es ist von vornherein erweiterbar; wir haben einige relevante Tools kuratiert, 
-aber wir werden Ihnen zeigen, wie das System einfach erweitert werden kann, um jedes von der Community entwickelte Tool zu verwenden.
-
-Beginnen wir mit einigen Beispielen dafür, was mit dieser neuen API erreicht werden kann. Sie ist besonders leistungsfähig, wenn es um 
-Sie ist besonders leistungsstark, wenn es um multimodale Aufgaben geht. Lassen Sie uns also eine Runde drehen, um Bilder zu erzeugen und Text vorzulesen.
-
-```py
-agent.run("Caption the following image", image=image)
-```
-
-| **Input**                                                                                                                   | **Output**                        |
-|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------|
-| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/beaver.png" width=200> | A beaver is swimming in the water |
-
---
-
-```py
-agent.run("Read the following text out loud", text=text)
-```
-| **Input**                                                                                                               | **Output**                                   |
-|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------|
-| A beaver is swimming in the water | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tts_example.wav" type="audio/wav"> your browser does not support the audio element. </audio>
-
---
-
-```py
-agent.run(
-    "In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?",
-    document=document,
-)
-```
-| **Input**                                                                                                                   | **Output**     |
-|-----------------------------------------------------------------------------------------------------------------------------|----------------|
-| <img src="https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/0/image/image.jpg" width=200> | ballroom foyer |
-
-## Schnellstart
-
-Bevor Sie `agent.run` verwenden können, müssen Sie einen Agenten instanziieren, der ein großes Sprachmodell (LLM) ist. 
-Wir bieten Unterstützung für openAI-Modelle sowie für OpenSource-Alternativen von BigCode und OpenAssistant. Die openAI
-Modelle sind leistungsfähiger (erfordern aber einen openAI-API-Schlüssel, können also nicht kostenlos verwendet werden); Hugging Face
-bietet kostenlosen Zugang zu Endpunkten für BigCode- und OpenAssistant-Modelle.
-
-To start with, please install the `agents` extras in order to install all default dependencies.
-```bash
-pip install transformers[agents]
-```
-
-Um openAI-Modelle zu verwenden, instanziieren Sie einen [`OpenAiAgent`], nachdem Sie die `openai`-Abhängigkeit installiert haben:
-
-```bash
-pip install openai
-```
-
-
-```py
-from transformers import OpenAiAgent
-
-agent = OpenAiAgent(model="text-davinci-003", api_key="<your_api_key>")
-```
-
-Um BigCode oder OpenAssistant zu verwenden, melden Sie sich zunächst an, um Zugriff auf die Inference API zu erhalten:
-
-```py
-from huggingface_hub import login
-
-login("<YOUR_TOKEN>")
-```
-
-Dann instanziieren Sie den Agenten
-
-```py
-from transformers import HfAgent
-
-# Starcoder
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-# StarcoderBase
-# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase")
-# OpenAssistant
-# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5")
-```
-
-Dies geschieht mit der Inferenz-API, die Hugging Face derzeit kostenlos zur Verfügung stellt. Wenn Sie Ihren eigenen Inferenz
-Endpunkt für dieses Modell (oder einen anderen) haben, können Sie die obige URL durch Ihren URL-Endpunkt ersetzen.
-
-<Tip>
-
-StarCoder und OpenAssistant sind kostenlos und leisten bei einfachen Aufgaben bewundernswert gute Arbeit. Allerdings halten die Kontrollpunkte
-nicht, wenn es um komplexere Aufforderungen geht. Wenn Sie mit einem solchen Problem konfrontiert sind, empfehlen wir Ihnen, das OpenAI
-Modell auszuprobieren, das zwar leider nicht quelloffen ist, aber zur Zeit eine bessere Leistung erbringt.
-
-</Tip>
-
-Sie sind jetzt startklar! Lassen Sie uns in die beiden APIs eintauchen, die Ihnen jetzt zur Verfügung stehen.
-
-### Einzelne Ausführung (run)
-
-Die Methode der einmaligen Ausführung ist die Verwendung der [`~Agent.run`] Methode des Agenten:
-
-```py
-agent.run("Draw me a picture of rivers and lakes.")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
-
-Es wählt automatisch das (oder die) Werkzeug(e) aus, das (die) für die von Ihnen gewünschte Aufgabe geeignet ist (sind) und führt es (sie) entsprechend aus. Es
-kann eine oder mehrere Aufgaben in der gleichen Anweisung ausführen (je komplexer Ihre Anweisung ist, desto wahrscheinlicher ist ein
-der Agent scheitern).
-
-```py
-agent.run("Draw me a picture of the sea then transform the picture to add an island")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sea_and_island.png" width=200>
-
-<br/>
-
-
-Jede [`~Agent.run`] Operation ist unabhängig, so dass Sie sie mehrmals hintereinander mit unterschiedlichen Aufgaben ausführen können.
-
-Beachten Sie, dass Ihr `Agent` nur ein großsprachiges Modell ist, so dass kleine Variationen in Ihrer Eingabeaufforderung völlig unterschiedliche Ergebnisse liefern können.
-unterschiedliche Ergebnisse liefern. Es ist wichtig, dass Sie die Aufgabe, die Sie ausführen möchten, so genau wie möglich erklären. Wir gehen noch weiter ins Detail
-wie man gute Prompts schreibt [hier](custom_tools#writing-good-user-inputs).
-
-Wenn Sie einen Status über Ausführungszeiten hinweg beibehalten oder dem Agenten Nicht-Text-Objekte übergeben möchten, können Sie dies tun, indem Sie
-Variablen, die der Agent verwenden soll. Sie könnten zum Beispiel das erste Bild von Flüssen und Seen erzeugen, 
-und das Modell bitten, dieses Bild zu aktualisieren und eine Insel hinzuzufügen, indem Sie Folgendes tun:
-
-```python
-picture = agent.run("Generate a picture of rivers and lakes.")
-updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture)
-```
-
-<Tip>
-
-Dies kann hilfreich sein, wenn das Modell Ihre Anfrage nicht verstehen kann und die Werkzeuge verwechselt. Ein Beispiel wäre:
-
-```py
-agent.run("Draw me the picture of a capybara swimming in the sea")
-```
-
-Hier könnte das Modell auf zwei Arten interpretieren:
- Die Funktion `Text-zu-Bild` erzeugt ein Wasserschwein, das im Meer schwimmt.
- Oder Sie lassen das `Text-zu-Bild` ein Wasserschwein erzeugen und verwenden dann das Werkzeug `Bildtransformation`, um es im Meer schwimmen zu lassen.
-
-Falls Sie das erste Szenario erzwingen möchten, können Sie dies tun, indem Sie die Eingabeaufforderung als Argument übergeben:
-
-```py
-agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea")
-```
-
-</Tip>
-
-
-### Chat-basierte Ausführung (Chat)
-
-Der Agent verfügt auch über einen Chat-basierten Ansatz, der die Methode [`~Agent.chat`] verwendet:
-
-```py
-agent.chat("Generate a picture of rivers and lakes")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200> 
-
-```py
-agent.chat("Transform the picture so that there is a rock in there")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_and_beaver.png" width=200>
-
-<br/>
-
-Dies ist ein interessanter Ansatz, wenn Sie den Zustand über Anweisungen hinweg beibehalten möchten. Er ist besser für Experimente geeignet, 
-eignet sich aber eher für einzelne Anweisungen als für komplexe Anweisungen (die die [`~Agent.run`]
-Methode besser verarbeiten kann).
-
-Diese Methode kann auch Argumente entgegennehmen, wenn Sie Nicht-Text-Typen oder bestimmte Aufforderungen übergeben möchten.
-
-### ⚠️ Fernausführung
-
-Zu Demonstrationszwecken und damit es mit allen Setups verwendet werden kann, haben wir Remote-Executors für mehrere 
-der Standard-Tools erstellt, auf die der Agent in dieser Version Zugriff hat. Diese werden erstellt mit 
-[inference endpoints](https://huggingface.co/inference-endpoints).
-
-Wir haben diese vorerst deaktiviert, aber um zu sehen, wie Sie selbst Remote Executors Tools einrichten können,
-empfehlen wir die Lektüre des [custom tool guide](./custom_tools).
-
-### Was passiert hier? Was sind Tools und was sind Agenten?
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/diagram.png">
-
-#### Agenten
-
-Der "Agent" ist hier ein großes Sprachmodell, das wir auffordern, Zugang zu einem bestimmten Satz von Tools zu erhalten.
-
-LLMs sind ziemlich gut darin, kleine Codeproben zu erzeugen. Diese API macht sich das zunutze, indem sie das 
-LLM ein kleines Codebeispiel gibt, das eine Aufgabe mit einer Reihe von Werkzeugen ausführt. Diese Aufforderung wird dann ergänzt durch die 
-Aufgabe, die Sie Ihrem Agenten geben, und die Beschreibung der Werkzeuge, die Sie ihm geben. Auf diese Weise erhält er Zugriff auf die Dokumentation der 
-Tools, insbesondere die erwarteten Eingaben und Ausgaben, und kann den entsprechenden Code generieren.
-
-#### Tools
-
-Tools sind sehr einfach: Sie bestehen aus einer einzigen Funktion mit einem Namen und einer Beschreibung. Wir verwenden dann die Beschreibungen dieser Tools 
-um den Agenten aufzufordern. Anhand der Eingabeaufforderung zeigen wir dem Agenten, wie er die Tools nutzen kann, um das zu tun, was in der 
-in der Abfrage angefordert wurde.
-
-Dies geschieht mit brandneuen Tools und nicht mit Pipelines, denn der Agent schreibt besseren Code mit sehr atomaren Tools. 
-Pipelines sind stärker refaktorisiert und fassen oft mehrere Aufgaben in einer einzigen zusammen. Tools sind dafür gedacht, sich auf
-eine einzige, sehr einfache Aufgabe konzentrieren.
-
-#### Code-Ausführung?!
-
-Dieser Code wird dann mit unserem kleinen Python-Interpreter auf den mit Ihren Tools übergebenen Eingaben ausgeführt. 
-Wir hören Sie schon schreien "Willkürliche Codeausführung!", aber lassen Sie uns erklären, warum das nicht der Fall ist.
-
-Die einzigen Funktionen, die aufgerufen werden können, sind die von Ihnen zur Verfügung gestellten Tools und die Druckfunktion, so dass Sie bereits eingeschränkt sind 
-eingeschränkt, was ausgeführt werden kann. Sie sollten sicher sein, wenn es sich auf die Werkzeuge für das Umarmungsgesicht beschränkt. 
-
-Dann lassen wir keine Attributsuche oder Importe zu (die ohnehin nicht benötigt werden, um die 
-Inputs/Outputs an eine kleine Gruppe von Funktionen), so dass alle offensichtlichen Angriffe (und Sie müssten den LLM 
-dazu auffordern, sie auszugeben) kein Problem darstellen sollten. Wenn Sie auf Nummer sicher gehen wollen, können Sie die 
-run()-Methode mit dem zusätzlichen Argument return_code=True ausführen. In diesem Fall gibt der Agent nur den auszuführenden Code 
-zur Ausführung zurück und Sie können entscheiden, ob Sie ihn ausführen möchten oder nicht.
-
-Die Ausführung bricht bei jeder Zeile ab, in der versucht wird, eine illegale Operation auszuführen, oder wenn ein regulärer Python-Fehler 
-mit dem vom Agenten generierten Code.
-
-### Ein kuratierter Satz von Tools
-
-Wir haben eine Reihe von Tools identifiziert, die solche Agenten unterstützen können. Hier ist eine aktualisierte Liste der Tools, die wir integriert haben 
-in `transformers` integriert haben:
-
- **Beantwortung von Fragen zu Dokumenten**: Beantworten Sie anhand eines Dokuments (z.B. PDF) im Bildformat eine Frage zu diesem Dokument ([Donut](./model_doc/donut))
- Beantworten von Textfragen**: Geben Sie einen langen Text und eine Frage an, beantworten Sie die Frage im Text ([Flan-T5](./model_doc/flan-t5))
- **Unbedingte Bildunterschriften**: Beschriften Sie das Bild! ([BLIP](./model_doc/blip))
- **Bildfragebeantwortung**: Beantworten Sie bei einem Bild eine Frage zu diesem Bild ([VILT](./model_doc/vilt))
- **Bildsegmentierung**: Geben Sie ein Bild und einen Prompt an und geben Sie die Segmentierungsmaske dieses Prompts aus ([CLIPSeg](./model_doc/clipseg))
- **Sprache in Text**: Geben Sie eine Audioaufnahme einer sprechenden Person an und transkribieren Sie die Sprache in Text ([Whisper](./model_doc/whisper))
- **Text in Sprache**: wandelt Text in Sprache um ([SpeechT5](./model_doc/speecht5))
- **Zero-Shot-Textklassifizierung**: Ermitteln Sie anhand eines Textes und einer Liste von Bezeichnungen, welcher Bezeichnung der Text am ehesten entspricht ([BART](./model_doc/bart))
- **Textzusammenfassung**: fassen Sie einen langen Text in einem oder wenigen Sätzen zusammen ([BART](./model_doc/bart))
- **Übersetzung**: Übersetzen des Textes in eine bestimmte Sprache ([NLLB](./model_doc/nllb))
-
-Diese Tools sind in Transformatoren integriert und können auch manuell verwendet werden, zum Beispiel:
-
-```py
-from transformers import load_tool
-
-tool = load_tool("text-to-speech")
-audio = tool("This is a text to speech tool")
-```
-
-### Benutzerdefinierte Tools
-
-Wir haben zwar eine Reihe von Tools identifiziert, sind aber der festen Überzeugung, dass der Hauptwert dieser Implementierung darin besteht 
-die Möglichkeit, benutzerdefinierte Tools schnell zu erstellen und weiterzugeben.
-
-Indem Sie den Code eines Tools in einen Hugging Face Space oder ein Modell-Repository stellen, können Sie das Tool 
-direkt mit dem Agenten nutzen. Wir haben ein paar neue Funktionen hinzugefügt 
-**transformers-agnostic** Tools zur [`huggingface-tools` Organisation](https://huggingface.co/huggingface-tools) hinzugefügt:
-
- **Text-Downloader**: zum Herunterladen eines Textes von einer Web-URL
- **Text zu Bild**: erzeugt ein Bild nach einer Eingabeaufforderung und nutzt dabei stabile Diffusion
- **Bildtransformation**: verändert ein Bild anhand eines Ausgangsbildes und einer Eingabeaufforderung, unter Ausnutzung der stabilen pix2pix-Diffusion
- **Text zu Video**: Erzeugen eines kleinen Videos nach einer Eingabeaufforderung, unter Verwendung von damo-vilab
-
-Das Text-zu-Bild-Tool, das wir von Anfang an verwendet haben, ist ein Remote-Tool, das sich in 
-[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! Wir werden
-weiterhin solche Tools für diese und andere Organisationen veröffentlichen, um diese Implementierung weiter zu verbessern.
-
-Die Agenten haben standardmäßig Zugriff auf die Tools, die sich auf [*huggingface-tools*](https://huggingface.co/huggingface-tools) befinden.
-Wie Sie Ihre eigenen Tools schreiben und freigeben können und wie Sie jedes benutzerdefinierte Tool, das sich auf dem Hub befindet, nutzen können, erklären wir in [folgender Anleitung](custom_tools).
-
-### Code-Erzeugung
-
-Bisher haben wir gezeigt, wie Sie die Agenten nutzen können, um Aktionen für Sie durchzuführen. Der Agent generiert jedoch nur Code
-den wir dann mit einem sehr eingeschränkten Python-Interpreter ausführen. Falls Sie den generierten Code in einer anderen Umgebung verwenden möchten 
-einer anderen Umgebung verwenden möchten, können Sie den Agenten auffordern, den Code zusammen mit einer Tooldefinition und genauen Importen zurückzugeben.
-
-Zum Beispiel die folgende Anweisung
-```python
-agent.run("Draw me a picture of rivers and lakes", return_code=True)
-```
-
-gibt den folgenden Code zurück
-
-```python
-from transformers import load_tool
-
-image_generator = load_tool("huggingface-tools/text-to-image")
-
-image = image_generator(prompt="rivers and lakes")
-```
-
-die Sie dann selbst ändern und ausführen können.
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -21,6 +21,8 @@
      title: Adding a new model to Transformers
    - local: modular_transformers
      title: Modular Transformers
+    - local: auto_docstring
+      title: Document your models
    - local: task_summary
      title: What 🤗 Transformers can do
    - local: tasks_explained
@ -37,6 +39,8 @@
      title: Tokenizers
    - local: image_processors
      title: Image processors
+    - local: video_processors
+      title: Video processors
    - local: backbones
      title: Backbones
    - local: feature_extractors
@ -149,6 +153,8 @@
      title: TPU
    - local: perf_train_special
      title: Apple Silicon
+    - local: perf_train_gaudi
+      title: Intel Gaudi
    - local: perf_hardware
      title: Build your own machine
    title: Hardware
@ -161,8 +167,14 @@
  sections:
  - local: quantization/overview
    title: Overview
+  - local: quantization/selecting
+    title: Selecting a quantization method
+  - local: quantization/concept_guide
+    title: Quantization concepts
  - local: quantization/aqlm
    title: AQLM
+  - local: quantization/auto_round
+    title: AutoRound
  - local: quantization/awq
    title: AWQ
  - local: quantization/bitnet
@ -279,6 +291,8 @@
        title: Image-text-to-text
      - local: tasks/video_text_to_text
        title: Video-text-to-text
+      - local: tasks/visual_document_retrieval
+        title: Visual Document Retrieval
      title: Multimodal
    title: Task recipes
  - local: run_scripts
@ -306,8 +320,6 @@
 - isExpanded: false
  sections:
  - sections:
-    - local: main_classes/agent
-      title: Agents and Tools
    - local: model_doc/auto
      title: Auto Classes
    - local: main_classes/backbones
@ -352,7 +364,9 @@
      title: Feature Extractor
    - local: main_classes/image_processor
      title: Image Processor
-    title: Main classes
+    - local: main_classes/video_processor
+      title: Video Processor
+    title: Main Classes
  - sections:
    - sections:
      - local: model_doc/albert
@ -379,6 +393,8 @@
        title: BigBirdPegasus
      - local: model_doc/biogpt
        title: BioGpt
+      - local: model_doc/bitnet
+        title: BitNet
      - local: model_doc/blenderbot
        title: Blenderbot
      - local: model_doc/blenderbot-small
@ -461,6 +477,8 @@
        title: Gemma2
      - local: model_doc/glm
        title: GLM
+      - local: model_doc/glm4
+        title: glm4
      - local: model_doc/openai-gpt
        title: GPT
      - local: model_doc/gpt_neo
@ -483,14 +501,16 @@
        title: Granite
      - local: model_doc/granitemoe
        title: GraniteMoe
+      - local: model_doc/granitemoehybrid
+        title: GraniteMoeHybrid
      - local: model_doc/granitemoeshared
        title: GraniteMoeShared
-      - local: model_doc/granitevision
-        title: GraniteVision
      - local: model_doc/helium
        title: Helium
      - local: model_doc/herbert
        title: HerBERT
+      - local: model_doc/hgnet_v2
+        title: HGNet-V2
      - local: model_doc/ibert
        title: I-BERT
      - local: model_doc/jamba
@ -507,8 +527,6 @@
        title: Llama2
      - local: model_doc/llama3
        title: Llama3
-      - local: model_doc/llama4
-        title: Llama4
      - local: model_doc/longformer
        title: Longformer
      - local: model_doc/longt5
@ -537,8 +555,6 @@
        title: MegatronGPT2
      - local: model_doc/mistral
        title: Mistral
-      - local: model_doc/mistral3
-        title: Mistral3
      - local: model_doc/mixtral
        title: Mixtral
      - local: model_doc/mluke
@ -589,8 +605,6 @@
        title: Phi
      - local: model_doc/phi3
        title: Phi-3
-      - local: model_doc/phi4_multimodal
-        title: Phi4 Multimodal
      - local: model_doc/phimoe
        title: PhiMoE
      - local: model_doc/phobert
@ -689,6 +703,8 @@
        title: ConvNeXTV2
      - local: model_doc/cvt
        title: CvT
+      - local: model_doc/d_fine
+        title: D-FINE
      - local: model_doc/dab-detr
        title: DAB-DETR
      - local: model_doc/deformable_detr
@ -735,6 +751,8 @@
        title: Mask2Former
      - local: model_doc/maskformer
        title: MaskFormer
+      - local: model_doc/mlcd
+        title: MLCD
      - local: model_doc/mobilenet_v1
        title: MobileNetV1
      - local: model_doc/mobilenet_v2
@ -813,12 +831,16 @@
        title: Bark
      - local: model_doc/clap
        title: CLAP
+      - local: model_doc/csm
+        title: CSM
      - local: model_doc/dac
        title: dac
      - local: model_doc/encodec
        title: EnCodec
      - local: model_doc/fastspeech2_conformer
        title: FastSpeech2Conformer
+      - local: model_doc/granite_speech
+        title: GraniteSpeech
      - local: model_doc/hubert
        title: Hubert
      - local: model_doc/mctct
@ -929,6 +951,8 @@
        title: GIT
      - local: model_doc/got_ocr2
        title: GOT-OCR2
+      - local: model_doc/granitevision
+        title: GraniteVision
      - local: model_doc/grounding-dino
        title: Grounding DINO
      - local: model_doc/groupvit
@ -943,6 +967,10 @@
        title: InstructBLIP
      - local: model_doc/instructblipvideo
        title: InstructBlipVideo
+      - local: model_doc/internvl
+        title: InternVL
+      - local: model_doc/janus
+        title: Janus
      - local: model_doc/kosmos-2
        title: KOSMOS-2
      - local: model_doc/layoutlm
@ -955,6 +983,8 @@
        title: LayoutXLM
      - local: model_doc/lilt
        title: LiLT
+      - local: model_doc/llama4
+        title: Llama4
      - local: model_doc/llava
        title: Llava
      - local: model_doc/llava_next
@ -969,6 +999,8 @@
        title: MatCha
      - local: model_doc/mgp-str
        title: MGP-STR
+      - local: model_doc/mistral3
+        title: Mistral3
      - local: model_doc/mllama
        title: mllama
      - local: model_doc/nougat
@ -985,10 +1017,14 @@
        title: PaliGemma
      - local: model_doc/perceiver
        title: Perceiver
+      - local: model_doc/phi4_multimodal
+        title: Phi4 Multimodal
      - local: model_doc/pix2struct
        title: Pix2Struct
      - local: model_doc/pixtral
        title: Pixtral
+      - local: model_doc/qwen2_5_omni
+        title: Qwen2.5-Omni
      - local: model_doc/qwen2_5_vl
        title: Qwen2.5-VL
      - local: model_doc/qwen2_audio
@ -997,6 +1033,8 @@
        title: Qwen2VL
      - local: model_doc/sam
        title: Segment Anything
+      - local: model_doc/sam_hq
+        title: Segment Anything High Quality
      - local: model_doc/shieldgemma2
        title: ShieldGemma2
      - local: model_doc/siglip
@ -1049,6 +1087,8 @@
        title: PatchTST
      - local: model_doc/time_series_transformer
        title: Time Series Transformer
+      - local: model_doc/timesfm
+        title: TimesFM
      title: Time series models
    - sections:
      - local: model_doc/graphormer
@ -1074,6 +1114,8 @@
      title: Utilities for Audio processing
    - local: internal/file_utils
      title: General Utilities
+    - local: internal/import_utils
+      title: Importing Utilities
    - local: internal/time_series_utils
      title: Utilities for Time Series
    title: Internal helpers
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@ -161,7 +161,7 @@ The downside is that if you aren't used to them, it may take some time to get us
 Run the command below to start and complete the questionnaire with some basic information about the new model. This command jumpstarts the process by automatically generating some model code that you'll need to adapt.

 ```bash
-transformers-cli add-new-model-like
+transformers add-new-model-like
 ```

 ## Create a pull request
@ -292,7 +292,7 @@ Once you're able to run the original checkpoint, you're ready to start adapting

 ## Adapt the model code

-The `transformers-cli add-new-model-like` command should have generated a model and configuration file.
+The `transformers add-new-model-like` command should have generated a model and configuration file.

 - `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py`
 - `src/transformers/models/brand_new_llama/configuration_brand_new_llama.py`
@ -551,10 +551,10 @@ While this example doesn't include an image processor, you may need to implement

 If you do need to implement a new image processor, refer to an existing image processor to understand the expected structure. Slow image processors ([`BaseImageProcessor`]) and fast image processors ([`BaseImageProcessorFast`]) are designed differently, so make sure you follow the correct structure based on the processor type you're implementing.

-Run the following command (only if you haven't already created the fast image processor with the `transformers-cli add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.
+Run the following command (only if you haven't already created the fast image processor with the `transformers add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.

 ```bash
-transformers-cli add-fast-image-processor --model-name your_model_name
+transformers add-fast-image-processor --model-name your_model_name
 ```

 This command will generate the necessary imports and provide a pre-filled template for the fast image processor. You can then modify it to fit your model's needs.
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@ -15,283 +15,4 @@ rendered properly in your Markdown viewer.
 -->

 > [!WARNING]
-> Agents and tools are being spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. These docs will be deprecated in the future!
-
-# Agents
-
-[[open-in-colab]]
-
-An agent is a system where a large language model (LLM) can execute more complex tasks through *planning* and using *tools*.
-
- Planning helps a LLM reason its way through a task by breaking it down into smaller subtasks. For example, [`CodeAgent`] plans a series of actions to take and then generates Python code to execute all the actions at once.
-
-    Another planning method is by self-reflection and refinement of its previous actions to improve its performance. The [`ReactJsonAgent`] is an example of this type of planning, and it's based on the [ReAct](https://hf.co/papers/2210.03629) framework. This agent plans and executes actions one at a time based on the feedback it receives from each action.
-
- Tools give a LLM access to external functions or APIs that it can use to help it complete a task. For example, [gradio-tools](https://github.com/freddyaboulton/gradio-tools) gives a LLM access to any of the [Gradio](https://www.gradio.app/) apps available on Hugging Face [Spaces](https://hf.co/spaces). These apps can be used for a wide range of tasks such as image generation, video generation, audio transcription, and more.
-
-To use agents in Transformers, make sure you have the extra `agents` dependencies installed.
-
-```bash
-!pip install transformers[agents]
-```
-
-Create an agent instance (refer to the [Agents](./main_classes/agent#agents) API for supported agents in Transformers) and a list of tools available for it to use, then [`~ReactAgent.run`] the agent on your task. The example below demonstrates how a ReAct agent reasons through a task.
-
-```py
-from transformers import ReactCodeAgent
-
-agent = ReactCodeAgent(tools=[])
-agent.run(
-    "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
-)
-```
-
-```bash
-======== New task ========
-How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
-==== Agent is executing the code below:
-bert_layers = 12  # BERT base encoder has 12 layers
-attention_layers = 6  # Encoder in Attention is All You Need has 6 layers
-layer_diff = bert_layers - attention_layers
-print("The difference in layers between BERT base encoder and Attention is All You Need is", layer_diff)
-====
-Print outputs:
-The difference in layers between BERT base encoder and Attention is All You Need is 6
-
-==== Agent is executing the code below:
-final_answer("BERT base encoder has {} more layers than the encoder from Attention is All You Need.".format(layer_diff))
-====
-Print outputs:
-
->>> Final answer:
-BERT base encoder has 6 more layers than the encoder from Attention is All You Need.
-```
-
-This guide will walk you through in more detail how to initialize an agent.
-
-## LLM
-
-An agent uses a LLM to plan and execute a task; it is the engine that powers the agent. To choose and build your own LLM engine, you need a method that:
-
-1. the input uses the [chat template](./chat_templating) format, `List[Dict[str, str]]`, and it returns a string
-2. the LLM stops generating outputs when it encounters the sequences in `stop_sequences`
-
-```py
-def llm_engine(messages, stop_sequences=["Task"]) -> str:
-    response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
-    answer = response.choices[0].message.content
-    return answer
-```
-
-Next, initialize an engine to load a model. To run an agent locally, create a [`TransformersEngine`] to load a preinitialized [`Pipeline`].
-
-However, you could also leverage Hugging Face's powerful inference infrastructure, [Inference API](https://hf.co/docs/api-inference/index) or [Inference Endpoints](https://hf.co/docs/inference-endpoints/index), to run your model. This is useful for loading larger models that are typically required for agentic behavior. In this case, load the [`HfApiEngine`] to run the agent.
-
-The agent requires a list of tools it can use to complete a task. If you aren't using any additional tools, pass an empty list. The default tools provided by Transformers are loaded automatically, but you can optionally set `add_base_tools=True` to explicitly enable them.
-
-<hfoptions id="engine">
-<hfoption id="TransformersEngine">
-
-```py
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TransformersEngine, CodeAgent
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct").to("cuda")
-pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
-llm_engine = TransformersEngine(pipeline)
-agent = CodeAgent(tools=[], llm_engine=llm_engine)
-agent.run(
-    "What causes bread to rise?",
-)
-```
-
-</hfoption>
-<hfoption id="HfApiEngine">
-
-```py
-from transformers import CodeAgent, HfApiEngine
-
-llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
-agent = CodeAgent(tools=[], llm_engine=llm_engine)
-agent.run(
-    "Could you translate this sentence from French, say it out loud and return the audio.",
-    sentence="Où est la boulangerie la plus proche?",
-)
-```
-
-</hfoption>
-</hfoptions>
-
-The agent supports [constrained generation](https://hf.co/docs/text-generation-inference/conceptual/guidance) for generating outputs according to a specific structure with the `grammar` parameter. The `grammar` parameter should be specified in the `llm_engine` method or you can set it when initializing an agent.
-
-Lastly, an agent accepts additional inputs such as text and audio. In the [`HfApiEngine`] example above, the agent accepted a sentence to translate. But you could also pass a path to a local or remote file for the agent to access. The example below demonstrates how to pass a path to an audio file.
-
-```py
-from transformers import ReactCodeAgent
-
-agent = ReactCodeAgent(tools=[], llm_engine=llm_engine)
-agent.run("Why doesn't he know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
-```
-
-## System prompt
-
-A system prompt describes how an agent should behave, a description of the available tools, and the expected output format.
-
-Tools are defined by the `<<tool_descriptions>>` token which is dynamically replaced during runtime with the actual tool. The tool description is derived from the tool name, description, inputs, output type, and a Jinja2 template. Refer to the [Tools](./tools) guide for more information about how to describe tools.
-
-The example below is the system prompt for [`ReactCodeAgent`].
-
-```py
-You will be given a task to solve as best you can.
-You have access to the following tools:
-<<tool_descriptions>>
-
-To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
-
-At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
-Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence.
-During each intermediate step, you can use 'print()' to save whatever important information you will then need.
-These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
-
-In the end you have to return a final answer using the `final_answer` tool.
-
-Here are a few examples using notional tools:
---
-{examples}
-
-Above example were using notional tools that might not exist for you. You only have access to those tools:
-<<tool_names>>
-You also can perform computations in the python code you generate.
-
-Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward.
-
-Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
-Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
-
-Remember to make sure that variables you use are all defined.
-
-Now Begin!
-```
-
-The system prompt can be tailored to the intended task. For example, you can add a better explanation of the output format or you can overwrite the system prompt template entirely with your own custom system prompt as shown below.
-
-> [!WARNING]
-> If you're writing a custom system prompt, make sure to include `<<tool_descriptions>>` in the template so the agent is aware of the available tools.
-
-```py
-from transformers import ReactJsonAgent
-from transformers.agents import PythonInterpreterTool
-
-agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
-```
-
-## Code execution
-
-For safety, only the tools you provide (and the default Transformers tools) and the `print` function are executed. The interpreter doesn't allow importing modules that aren't on a safe list.
-
-To import modules that aren't on the list, add them as a list to the `additional_authorized_imports` parameter when initializing an agent.
-
-```py
-from transformers import ReactCodeAgent
-
-agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
-agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
-```
-
-Code execution stops if a tool isn't on the safe list, it isn't authorized, or if the code generated by the agent returns a Python error.
-
-> [!WARNING]
-> A LLM can generate any arbitrary code that can be executed, so don't add any unsafe imports!
-
-## Multi-agent
-
-[Multi-agent](https://hf.co/papers/2308.08155) refers to multiple agents working together to solve a task. Performance is typically better because each agent is specialized for a particular subtask.
-
-Multi-agents are created through a [`ManagedAgent`] class, where a *manager agent* oversees how other agents work together. The manager agent requires an agent and their name and description. These are added to the manager agents system prompt which lets it know how to call and use them.
-
-The multi-agent example below creates a web search agent that is managed by another [`ReactCodeAgent`].
-
-```py
-from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent
-
-llm_engine = HfApiEngine()
-web_agent = ReactCodeAgent(tools=[DuckDuckGoSearchTool()], llm_engine=llm_engine)
-managed_web_agent = ManagedAgent(
-    agent=web_agent,
-    name="web_search",
-    description="Runs web searches for you. Give it your query as an argument."
-)
-manager_agent = ReactCodeAgent(
-    tools=[], llm_engine=llm_engine, managed_agents=[managed_web_agent]
-)
-manager_agent.run("Who is the CEO of Hugging Face?")
-```
-
-## Gradio integration
-
-[Gradio](https://www.gradio.app/) is a library for quickly creating and sharing machine learning apps. The [gradio.Chatbot](https://www.gradio.app/docs/gradio/chatbot) supports chatting with a Transformers agent with the [`stream_to_gradio`] function.
-
-Load a tool and LLM with an agent, and then create a Gradio app. The key is to use [`stream_to_gradio`] to stream the agents messages and display how it's reasoning through a task.
-
-```py
-import gradio as gr
-from transformers import (
-    load_tool,
-    ReactCodeAgent,
-    HfApiEngine,
-    stream_to_gradio,
-)
-
-# Import tool from Hub
-image_generation_tool = load_tool("m-ric/text-to-image")
-llm_engine = HfApiEngine("meta-llama/Meta-Llama-3-70B-Instruct")
-
-# Initialize the agent with the image generation tool
-agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
-
-def interact_with_agent(task):
-    messages = []
-    messages.append(gr.ChatMessage(role="user", content=task))
-    yield messages
-    for msg in stream_to_gradio(agent, task):
-        messages.append(msg)
-        yield messages + [
-            gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
-        ]
-    yield messages
-
-with gr.Blocks() as demo:
-    text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
-    submit = gr.Button("Run illustrator agent!")
-    chatbot = gr.Chatbot(
-        label="Agent",
-        type="messages",
-        avatar_images=(
-            None,
-            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
-        ),
-    )
-    submit.click(interact_with_agent, [text_input], [chatbot])
-
-if __name__ == "__main__":
-    demo.launch()
-```
-
-## Troubleshoot
-
-For a better idea of what is happening when you call an agent, it is always a good idea to check the system prompt template first.
-
-```py
-print(agent.system_prompt_template)
-```
-
-If the agent is behaving unexpectedly, remember to explain the task you want to perform as clearly as possible. Every [`~Agent.run`] is different and minor variations in your system prompt may yield completely different results.
-
-To find out what happened after a run, check the following agent attributes.
-
- `agent.logs` stores the finegrained agent logs. At every step of the agents run, everything is stored in a dictionary and appended to `agent.logs`.
- `agent.write_inner_memory_from_logs` only stores a high-level overview of the agents run. For example, at each step, it stores the LLM output as a message and the tool call output as a separate message. Not every detail from a step is transcripted by `write_inner_memory_from_logs`.
-
-## Resources
-
-Learn more about ReAct agents in the [Open-source LLMs as LangChain Agents](https://hf.co/blog/open-source-llms-as-agents) blog post.
+> Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
--- a/docs/source/en/attention_interface.md
+++ b/docs/source/en/attention_interface.md
@ -108,7 +108,7 @@ If in doubt about what args/kwargs a given model sends to the attention function
 ## Accessing current available implementations

 Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
-and/or perform a few checks, the prefered way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
+and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
 would expect from a usual Python dictionary:

 ```python
--- a/docs/source/en/auto_docstring.md
+++ b/docs/source/en/auto_docstring.md
@ -0,0 +1,279 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Utilizing the @auto_docstring Decorator
+
+The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
+
+---
+
+## 📜 How it Works
+
+The `@auto_docstring` decorator constructs docstrings by:
+
+1.  **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
+2.  **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
+3.  **Overriding or Adding Arguments Descriptions:**
+    * **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
+    * **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
+4.  **Adding Classes and Functions Introduction:**
+    * **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
+    * **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
+5.  **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
+6.  **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
+7.  **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extracts field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
+8.  **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
+
+
+---
+
+## 🚀 How to Use @auto_docstring
+
+### 1. Importing the Decorator
+Import the decorator into your modeling file:
+
+```python
+from ...utils import auto_docstring
+```
+
+### 2. Applying to Classes
+Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
+
+```python
+from transformers.modeling_utils import PreTrainedModel
+from ...utils import auto_docstring
+
+@auto_docstring
+class MyAwesomeModel(PreTrainedModel):
+    def __init__(self, config, custom_parameter: int = 10, another_custom_arg: str = "default"):
+        r"""
+        custom_parameter (`int`, *optional*, defaults to 10):
+            Description of the custom_parameter for MyAwesomeModel.
+        another_custom_arg (`str`, *optional*, defaults to "default"):
+            Documentation for another unique argument.
+        """
+        super().__init__(config)
+        self.custom_parameter = custom_parameter
+        self.another_custom_arg = another_custom_arg
+        # ... rest of your init
+
+    # ... other methods
+```
+
+#### Advanced Class Decoration:
+
+Arguments can be passed directly to `@auto_docstring` for more control:
+
+```python
+@auto_docstring(
+    custom_intro="""This model performs specific synergistic operations.
+    It builds upon the standard Transformer architecture with unique modifications.""",
+    custom_args="""
+    custom_parameter (`type`, *optional*, defaults to `default_value`):
+        A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
+    internal_helper_arg (`type`, *optional*, defaults to `default_value`):
+        A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
+    """
+)
+class MySpecialModel(PreTrainedModel):
+    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
+        # ...
+```
+
+Or:
+
+```python
+@auto_docstring(
+    custom_intro="""This model performs specific synergistic operations.
+    It builds upon the standard Transformer architecture with unique modifications.""",
+)
+class MySpecialModel(PreTrainedModel):
+    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
+        r"""
+        custom_parameter (`type`, *optional*, defaults to `default_value`):
+            A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
+        internal_helper_arg (`type`, *optional*, defaults to `default_value`):
+            A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
+        """
+        # ...
+```
+
+### 3. Applying to Functions (e.g., `forward` method)
+Apply the decorator above method definitions, such as the `forward` method.
+
+```python
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        new_custom_argument: Optional[torch.Tensor] = None,
+        arg_documented_in_args_doc: Optional[torch.Tensor] = None,
+        # ... other arguments
+    ) -> Union[Tuple, ModelOutput]: # The description of the return value will automatically be generated from the ModelOutput class docstring.
+        r"""
+        new_custom_argument (`torch.Tensor`, *optional*):
+            Description of this new custom argument and its expected shape or type.
+        """
+        # ...
+```
+
+#### Advanced Function Decoration:
+
+Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:
+
+```python
+MODEL_COMMON_CUSTOM_ARGS = r"""
+    common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
+        Description of common_arg_1
+    common_arg_2 (`torch.Tensor`, *optional*, defaults to `default_value`):
+        Description of common_arg_2
+    ...
+"""
+
+class MyModel(PreTrainedModel):
+    # ...
+    @auto_docstring(
+        custom_intro="""
+        This is a custom introduction for the function.
+        """
+        custom_args=MODEL_COMMON_CUSTOM_ARGS
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        common_arg_1: Optional[torch.Tensor] = None,
+        common_arg_2: Optional[torch.Tensor] = None,
+        #...
+        function_specific_argument: Optional[torch.Tensor] = None,
+        # ... other arguments
+    ) -> torch.Tensor:
+        r"""
+        function_specific_argument (`torch.Tensor`, *optional*):
+            Description of an argument specific to this function
+
+        Returns:
+            `torch.Tensor`: For a function returning a generic type, a custom "Returns" section can be specified.
+
+        Example:
+
+        (To override the default example with a custom one or to add an example for a model class that does not have a pipeline)
+
+        ```python
+        ...
+        ```
+        """
+        # ...
+```
+
+---
+
+### ✍️ Documenting Arguments: Approach & Priority
+
+1.  **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
+    * `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
+
+2.  **New or Custom Arguments:**
+    * **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
+    * **Format:**
+        ```
+        argument_name (`type`, *optional*, defaults to `X`):
+            Description of the argument.
+            Explain its purpose, expected shape/type if complex, and default behavior.
+            This can span multiple lines.
+        ```
+    * Include `type` in backticks.
+    * Add "*optional*" if the argument is not required (has a default value).
+    * Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
+
+3.  **Overriding Standard Arguments:**
+    * If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
+    * The `labels` argument is often customized per model and typically requires a specific docstring.
+
+4.  **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
+    * New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
+
+---
+
+### Usage with [modular files](./modular_transformers)
+
+When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
+
+- **For standalone models in modular files:**
+  Apply the `@auto_docstring` decorator just as you would in regular modeling files.
+
+- **For models inheriting from other library models:**
+  - When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
+  - If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
+
+  > **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
+
+
+**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
+
+---
+
+## ✅ Checking Your Docstrings with `check_auto_docstrings`
+
+The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
+
+#### What it Checks:
+
+* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
+* **Argument Completeness & Consistency:**
+    * Flags arguments in the signature that are not known standard arguments and lack a local description.
+    * Ensures documented arguments exist in the signature. (TODO)
+    * Verifies that types and default values in the docstring match the signature. (TODO)
+* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
+* **Formatting:** Adherence to the expected docstring style.
+
+#### Running the Check Locally:
+
+Run this check locally before committing. The common command is:
+
+```bash
+make fix-copies
+```
+
+Alternatively, to only perform docstrings and auto-docstring checks, you can use:
+
+```bash
+python utils/check_docstrings.py # to only check files included in the diff without fixing them
+# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
+# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
+```
+
+#### Workflow with the Checker:
+
+1.  Add `@auto_docstring(...)` to the class or method.
+2.  For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
+3.  Run `make fix-copies` (or the `check_docstrings.py` utility).
+    * For unrecognized arguments lacking documentation, the utility will create placeholder entries.
+4.  Manually edit these placeholders with accurate types and descriptions.
+5.  Re-run the check to ensure all issues are resolved.
+
+---
+
+## 🔑 Key Takeaways & Best Practices
+
+* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary for methods (e.g., `forward`, `get_text_features` etc.).
+* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
+* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
+* Document new or custom arguments clearly.
+* Run `check_docstrings` locally and iteratively.
+
+By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
--- a/docs/source/en/chat_templating_multimodal.md
+++ b/docs/source/en/chat_templating_multimodal.md
@ -181,35 +181,6 @@ processed_chat = processor.apply_chat_template(
 print(processed_chat.keys())
 ```

-</hfoption>
-<hfoption id="custom frame sampling">
-
-Some models don't sample frames *uniformly* and require more complex logic to determine which frames to use. For example, the model may have an *adaptive frame selection* or if the model prioritizes *key moments* in a video rather than evenly spaced frames.
-
-If a model has a different sampling strategy, you can write a function that customizes frame selection. The function should include the following requirements.
-
- Use the `sample_indices_fn` parameter to pass a callable function for sampling.
- If provided, this function *overrides* the standard `num_frames` and `fps` parameters.
- The function receives all the parameters passed to `load_video` and must return valid frame indices to sample from.
-
-An example function is shown below. This gives you full control over frame selection, making the model more adaptable to different video scenarios.
-
-```py
-def sample_indices_fn(metadata, **kwargs):
-    # samples only the first and the second frame
-    return [0, 1]
-
-processed_chat = processor.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    sample_indices_fn=sample_indices_fn,
-    video_load_backend="decord",
-)
-print(processed_chat.keys())
-```
-
 </hfoption>
 <hfoption id="list of image frames">

--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -25,22 +25,28 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_

 This guide shows you how to quickly start chatting with Transformers from the command line, how build and format a conversation, and how to chat using the [`TextGenerationPipeline`].

-## transformers-cli
+## transformers CLI

-Chat with a model directly from the command line as shown below. It launches an interactive session with a model. Enter `clear` to reset the conversation, `exit` to terminate the session, and `help` to display all the command options.
+After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.

 ```bash
-transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
+transformers chat Qwen/Qwen2.5-0.5B-Instruct
 ```

 <div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers-chat-cli.png"/>
 </div>

+You can launch the CLI with arbitrary `generate` flags, with the format `arg_1=value_1 arg_2=value_2 ...`
+
+```bash
+transformers chat Qwen/Qwen2.5-0.5B-Instruct do_sample=False max_new_tokens=10
+```
+
 For a full list of options, run the command below.

 ```bash
-transformers-cli chat -h
+transformers chat -h
 ```

 The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).
@ -76,16 +82,16 @@ print(response[0]["generated_text"][-1]["content"])
 (sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,
 alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!

-So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million 
-things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of 
-Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for 
-something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got 
+So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
+things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
+Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
+something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
 some wild stuff, like that Warhol guy's soup cans and all that jazz.

-And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for 
+And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
 those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.

-Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might 
+Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
 even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)

 And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
@ -107,9 +113,9 @@ print(response[0]["generated_text"][-1]["content"])
 ```

 ```txt
-(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man! 
-It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's 
-like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!" 
+(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
+It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
+like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
 (sarcastically) Oh, yeah, real original, Andy.

 But, you know, back in the '60s, it was like, a big deal. People were all about challenging the
--- a/docs/source/en/generation_strategies.md
+++ b/docs/source/en/generation_strategies.md
@ -20,18 +20,22 @@ A decoding strategy informs how a model should select the next generated token.

 This guide will help you understand the different decoding strategies available in Transformers and how and when to use them.

-## Greedy search
+## Basic decoding methods

-Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in [`GenerationConfig`], this strategy generates a maximum of 20 tokens.
+These are well established decoding methods, and should be your starting point for text generation tasks.

-Greedy search works well for tasks with relatively short outputs. However, it breaks down when generating longer sequences because it begins to repeat itself.
+### Greedy search
+
+Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in [`GenerationConfig`], this strategy generates a maximum of 20 new tokens.
+
+Greedy search works well for tasks with relatively short outputs where creativity is not a priority. However, it breaks down when generating longer sequences because it begins to repeat itself.

 ```py
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("I look forward to", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
 # explicitly set to default length because Llama2 generation length is 4096
@ -40,11 +44,11 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
 'Hugging Face is an open-source company that provides a suite of tools and services for building, deploying, and maintaining natural language processing'
 ```

-## Contrastive search
+### Sampling

-[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied.
+Sampling, or multinomial sampling, randomly selects a token based on the probability distribution over the entire model's vocabulary (as opposed to the most likely token, as in greedy search). This means every token with a non-zero probability has a chance to be selected. Sampling strategies reduce repetition and can generate more creative and diverse outputs.

-Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return.
+Enable multinomial sampling with `do_sample=True` and `num_beams=1`.

 ```py
 import torch
@ -55,14 +59,14 @@ inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt"

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
 # explicitly set to 100 because Llama2 generation length is 4096
-outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
+outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
 tokenizer.batch_decode(outputs, skip_special_tokens=True)
-'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has'
+'Hugging Face is an open-source company 🤗\nWe are open-source and believe that open-source is the best way to build technology. Our mission is to make AI accessible to everyone, and we believe that open-source is the best way to achieve that.'
 ```

-## Beam search
+### Beam search

-Beam search keeps track of several generated sequences (beams) at each time step. After a certain number of steps, it selects the sequence with the highest *overall* probability. Unlike greedy search, this strategy can "look ahead" and pick a sequence with a higher probability overall even if the initial tokens have a lower probability.
+Beam search keeps track of several generated sequences (beams) at each time step. After a certain number of steps, it selects the sequence with the highest *overall* probability. Unlike greedy search, this strategy can "look ahead" and pick a sequence with a higher probability overall even if the initial tokens have a lower probability. It is best suited for input-grounded tasks, like describing an image or speech recognition. You can also use `do_sample=True` with beam search to sample at each step, but beam search will still greedily prune out low probability sequences between steps.

 > [!TIP]
 > Check out the [beam search visualizer](https://huggingface.co/spaces/m-ric/beam_search_visualizer) to see how beam search works.
@ -83,66 +87,11 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
 "['Hugging Face is an open-source company that develops and maintains the Hugging Face platform, which is a collection of tools and libraries for building and deploying natural language processing (NLP) models. Hugging Face was founded in 2018 by Thomas Wolf']"
 ```

-## Diverse beam search
+## Advanced decoding methods

-[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups.
+Advanced decoding methods aim at either tackling specific generation quality issues (e.g. repetition) or at improving the generation throughput in certain situations. These techniques are more complex, and may not work correctly with all models.

-Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`).
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
-
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
-# explicitly set to 100 because Llama2 generation length is 4096
-outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
-tokenizer.batch_decode(outputs, skip_special_tokens=True)
-'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a'
-```
-
-## Multinomial sampling
-
-Search methods selects the most likely tokens. Sampling, or multinomial sampling, randomly selects a token based on the probability distribution over the entire models vocabulary. This means every token with a non-zero probability has a chance to be selected. Sampling strategies reduce repetition and can generate more creative and diverse outputs.
-
-Enable multinomial sampling with `do_sample=True` and `num_beams=1`.
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
-
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
-# explicitly set to 100 because Llama2 generation length is 4096
-outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
-tokenizer.batch_decode(outputs, skip_special_tokens=True)
-'Hugging Face is an open-source company 🤗\nWe are open-source and believe that open-source is the best way to build technology. Our mission is to make AI accessible to everyone, and we believe that open-source is the best way to achieve that.'
-```
-
-## Beam search multinomial sampling
-
-This decoding strategy is a combination of beam search and multinomial sampling. It generates multiple beams and uses a sampling strategy for each beam.
-
-Enable beam search multinomial sampling by setting `num_beams` to a value greater than 1 and `do_sample=True`.
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
-
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
-# explicitly set to 100 because Llama2 generation length is 4096
-outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=4)
-'Hugging Face is an open-source company 100% dedicated to making AI more accessible. We believe that AI should be available to everyone, and we’re working hard to make that a reality.\nWe’re a team of passionate engineers, designers,'
-```
-
-## Speculative decoding
+### Speculative decoding

 [Speculative](https://hf.co/papers/2211.17192) or assistive decoding isn't a search or sampling strategy. Instead, speculative decoding adds a second smaller model to generate candidate tokens. The main model verifies the candidate tokens in a single `forward` pass, which speeds up the decoding process overall. This method is especially useful for LLMs where it can be more costly and slower to generate tokens. Refer to the [speculative decoding](./llm_optims#speculative-decoding) guide to learn more.

@ -203,7 +152,7 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
 </hfoption>
 </hfoptions>

-### Prompt lookup decoding
+#### Prompt lookup decoding

 [Prompt lookup decoding](./llm_optims#prompt-lookup-decoding) is a variant of speculative decoding that uses overlapping n-grams as the candidate tokens. It works well for input-grounded tasks such as summarization. Refer to the [prompt lookup decoding](./llm_optims#prompt-lookup-decoding) guide to learn more.

@ -245,7 +194,7 @@ outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_
 tokenizer.batch_decode(outputs, skip_special_tokens=True)
 ```

-### Universal assisted decoding
+#### Universal assisted decoding

 Universal assisted decoding (UAD) enables the main and assistant models to use different tokenizers. The main models input tokens are re-encoded into assistant model tokens. Candidate tokens are generated in the assistant encoding which are re-encoded into the main model candidate tokens. The candidate tokens are verified as explained in [speculative decoding](#speculative-decoding).

@ -269,7 +218,27 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
 ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
 ```

-## DoLa
+### Contrastive search
+
+[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied.
+
+Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+# explicitly set to 100 because Llama2 generation length is 4096
+outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
+tokenizer.batch_decode(outputs, skip_special_tokens=True)
+'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has'
+```
+
+### DoLa

 [Decoding by Contrasting Layers (DoLa)](https://hf.co/papers/2309.03883) is a contrastive decoding strategy for improving factuality and reducing hallucination. This strategy works by contrasting the logit differences between the final and early layers. As a result, factual knowledge localized to particular layers are amplified. DoLa is not recommended for smaller models like GPT-2.

@ -325,6 +294,210 @@ tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tok
 </hfoption>
 </hfoptions>

+### Diverse beam search
+
+[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups.
+
+Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`).
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+# explicitly set to 100 because Llama2 generation length is 4096
+outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
+tokenizer.batch_decode(outputs, skip_special_tokens=True)
+'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a'
+```
+
+
+## Custom decoding methods
+
+Custom decoding methods enable specialized generation behavior such as the following:
+- have the model continue thinking if it is uncertain;
+- roll back generation if the model gets stuck;
+- handle special tokens with custom logic;
+- enhanced input preparation for advanced models;
+
+We enable custom decoding methods through model repositories, assuming a specific model tag and file structure (see subsection below). This feature is an extension of [custom modeling code](./models.md#custom-models) and, like such, requires setting `trust_remote_code=True`.
+
+If a model repository holds a custom decoding method, the easiest way to try it out is to load the model and generate with it:
+
+<!-- TODO before merging: 1) better repo name (use a `generate-community` org?) 2) prettify the repo -->
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# `transformers-community/custom_generate_example` holds a copy of `Qwen/Qwen2.5-0.5B-Instruct`, but
+# with custom generation code -> calling `generate` uses the custom decoding method!
+tokenizer = AutoTokenizer.from_pretrained("transformers-community/custom_generate_example")
+model = AutoModelForCausalLM.from_pretrained(
+    "transformers-community/custom_generate_example", device_map="auto", trust_remote_code=True
+)
+
+inputs = tokenizer(["The quick brown"], return_tensors="pt").to(model.device)
+# The custom decoding method is a minimal greedy decoding implementation. It also prints a custom message at run time.
+gen_out = model.generate(**inputs)
+# you should now see its custom message, "✨ using a custom generation method ✨"
+print(tokenizer.batch_decode(gen_out, skip_special_tokens=True))
+'The quick brown fox jumps over a lazy dog, and the dog is a type of animal. Is'
+```
+
+Model repositories with custom decoding methods have a special property: their decoding method can be loaded from **any** model through [`~GenerationMixin.generate`]'s `custom_generate` argument. This means anyone can create and share their custom generation method to potentially work with any Transformers model, without requiring users to install additional Python packages.
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
+
+inputs = tokenizer(["The quick brown"], return_tensors="pt").to(model.device)
+# `custom_generate` replaces the original `generate` by the custom decoding method defined in
+# `transformers-community/custom_generate_example`
+gen_out = model.generate(**inputs, custom_generate="transformers-community/custom_generate_example", trust_remote_code=True)
+print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+'The quick brown fox jumps over a lazy dog, and the dog is a type of animal. Is'
+```
+
+You should read the `README.md` file of the repository containing the custom generation strategy to see what the new arguments and output type differences are, if they exist. Otherwise, you can assume it works like the base [`~GenerationMixin.generate`] method.
+
+> [!TIP]
+> You can find all custom decoding methods by [searching for their custom tag.](https://huggingface.co/models?other=custom_generate), `custom_generate`
+
+Consider the Hub repository [transformers-community/custom_generate_example](https://huggingface.co/transformers-community/custom_generate_example) as an example. The `README.md` states that it has an additional input argument, `left_padding`, which adds a number of padding tokens before the prompt.
+
+```py
+gen_out = model.generate(
+    **inputs, custom_generate="transformers-community/custom_generate_example", trust_remote_code=True, left_padding=5
+)
+print(tokenizer.batch_decode(gen_out)[0])
+'<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>The quick brown fox jumps over the lazy dog.\n\nThe sentence "The quick'
+```
+
+If the custom method has pinned Python requirements that your environment doesn't meet, you'll get an exception about missing requirements. For instance, [transformers-community/custom_generate_bad_requirements](https://huggingface.co/transformers-community/custom_generate_bad_requirements) has an impossible set of requirements defined in its `custom_generate/requirements.txt` file, and you'll see the error message below if you try to run it.
+
+```
+ImportError: Missing requirements in your local environment for `transformers-community/custom_generate_bad_requirements`:
+foo (installed: None)
+bar==0.0.0 (installed: None)
+torch>=99.0 (installed: 2.6.0)
+```
+
+Updating your Python requirements accordingly will remove this error message.
+
+### Creating a custom decoding method
+
+To create a new decoding method, you need to create a new [**Model**](https://huggingface.co/new) repository and push a few files into it.
+1. The model you've designed your decoding method with.
+2. `custom_generate/generate.py`, which contains all the logic for your custom decoding method.
+3. `custom_generate/requirements.txt`, used to optionally add new Python requirements and/or lock specific versions to correctly use your method.
+4. `README.md`, where you should add the `custom_generate` tag and document any new arguments or output type differences of your custom method here.
+
+After you've added all required files, your repository should look like this
+
+```
+your_repo/
+├── README.md          # include the 'custom_generate' tag
+├── config.json
+├── ...
+└── custom_generate/
+    ├── generate.py
+    └── requirements.txt
+```
+
+#### Adding the base model
+
+The starting point for your custom decoding method is a model repository just like any other. The model to add to this repository should be the model you've designed your method with, and it is meant to be part of a working self-contained model-generate pair. When the model in this repository is loaded, your custom decoding method will override `generate`. Don't worry -- your decoding method can still be loaded with any other Transformers model, as explained in the section above.
+
+If you simply want to copy an existing model, you can do
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("source/model_repo")
+model = AutoModelForCausalLM.from_pretrained("source/model_repo")
+tokenizer.save_pretrained("your/decoding_method", push_to_hub=True)
+model.save_pretrained("your/decoding_method", push_to_hub=True)
+```
+
+#### generate.py
+
+This is the core of your decoding method. It *must* contain a method named `generate`, and this method *must* contain a `model` argument as its first argument. `model` is the model instance, which means you have access to all attributes and methods in the model, including the ones defined in [`GenerationMixin`] (like the base `generate` method).
+
+> [!WARNING]
+> `generate.py` must be placed in a folder named `custom_generate`, and not at the root level of the repository. The file paths for this feature are hardcoded.
+
+Under the hood, when the base [`~GenerationMixin.generate`] method is called with a `custom_generate` argument, it first checks its Python requirements (if any), then locates the custom `generate` method in `generate.py`, and finally calls the custom `generate`. All received arguments and `model` are forwarded to your custom `generate` method.
+
+This means your `generate` can have a mix of original and custom arguments (as well as a different output type) as shown below.
+
+```py
+import torch
+
+def generate(model, input_ids, generation_config=None, left_padding=None, **kwargs):
+    generation_config = generation_config or model.generation_config  # default to the model generation config
+    cur_length = input_ids.shape[1]
+    max_length = generation_config.max_length or cur_length + generation_config.max_new_tokens
+
+    # Example of custom argument: add `left_padding` (integer) pad tokens before the prompt
+    if left_padding is not None:
+        if not isinstance(left_padding, int) or left_padding < 0:
+            raise ValueError(f"left_padding must be an integer larger than 0, but is {left_padding}")
+
+        pad_token = kwargs.pop("pad_token", None) or generation_config.pad_token_id or model.config.pad_token_id
+        if pad_token is None:
+            raise ValueError("pad_token is not defined")
+        batch_size = input_ids.shape[0]
+        pad_tensor = torch.full(size=(batch_size, left_padding), fill_value=pad_token).to(input_ids.device)
+        input_ids = torch.cat((pad_tensor, input_ids), dim=1)
+        cur_length = input_ids.shape[1]
+
+    # Simple greedy decoding loop
+    while cur_length < max_length:
+        logits = model(input_ids).logits
+        next_token_logits = logits[:, -1, :]
+        next_tokens = torch.argmax(next_token_logits, dim=-1)
+        input_ids = torch.cat((input_ids, next_tokens[:, None]), dim=-1)
+        cur_length += 1
+
+    return input_ids
+```
+
+Follow the recommended practices below to ensure your custom decoding method works as expected.
+- Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
+- Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
+- You can add other files in the `custom_generate` folder, and use relative imports.
+- Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment.
+
+#### requirements.txt
+
+You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.
+
+#### README.md
+
+The root level `README.md` in the model repository usually describes the model therein. However, since the focus of the repository is the custom decoding method, we highly recommend to shift its focus towards describing the custom decoding method. In addition to a description of the method, we recommend documenting any input and/or output differences to the original [`~GenerationMixin.generate`]. This way, users can focus on what's new, and rely on Transformers docs for generic implementation details.
+
+For discoverability, we highly recommend you to add the `custom_generate` tag to your repository. To do so, the top of your `README.md` file should look like the example below. After you push the file, you should see the tag in your repository!
+
+```
+---
+library_name: transformers
+tags:
+  - custom_generate
+---
+
+(your markdown content here)
+```
+
+Recommended practices:
+- Document input and output differences in [`~GenerationMixin.generate`].
+- Add self-contained examples to enable quick experimentation.
+- Describe soft-requirements such as if the method only works well with a certain family of models.
+
+
 ## Resources

 Read the [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate) blog post for an explanation of how common decoding strategies work.
--- a/docs/source/en/how_to_hack_models.md
+++ b/docs/source/en/how_to_hack_models.md
@ -90,11 +90,6 @@ class SamVisionAttentionSplit(SamVisionAttention, nn.Module):

        attn_weights = (query * self.scale) @ key.transpose(-2, -1)

-        if self.use_rel_pos:
-            attn_weights = self.add_decomposed_rel_pos(
-                attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
-            )
-
        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
@ -114,13 +109,14 @@ Load the model with [`~PreTrainedModel.from_pretrained`].

 ```py
 from transformers import SamModel
-from transformers.models.sam import modeling_sam
-
-# replace the attention class in the modeling_sam module
-modeling_sam.SamVisionAttention = SamVisionAttentionSplit

 # load the pretrained SAM model
 model = SamModel.from_pretrained("facebook/sam-vit-base")
+
+# replace the attention class in the vision_encoder module
+for layer in model.vision_encoder.layers:
+    if hasattr(layer, "attn"):
+        layer.attn = SamVisionAttentionSplit(model.config.vision_config, model.config.vision_config.window_size)
 ```

 ## LoRA
@ -138,7 +134,7 @@ config = LoraConfig(
    # apply LoRA to q and v
    target_modules=["q", "v"],
    lora_dropout=0.1,
-    task_type="mask-generation"
+    task_type="FEATURE_EXTRACTION"
 )
 ```

@ -152,5 +148,5 @@ Call [print_trainable_parameters](https://huggingface.co/docs/peft/package_refer

 ```py
 model.print_trainable_parameters()
-"trainable params: 608,256 || all params: 94,343,728 || trainable%: 0.6447"
+"trainable params: 589,824 || all params: 94,274,096 || trainable%: 0.6256"
 ```
--- a/docs/source/en/image_processors.md
+++ b/docs/source/en/image_processors.md
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

 # Image processors

-Image processors converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.
+Image processors converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.

 - [`~BaseImageProcessor.center_crop`] to resize an image
 - [`~BaseImageProcessor.normalize`] or [`~BaseImageProcessor.rescale`] pixel values
--- a/docs/source/en/installation.md
+++ b/docs/source/en/installation.md
@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.

 # Installation

-Transformers works with [PyTorch](https://pytorch.org/get-started/locally/), [TensorFlow 2.0](https://www.tensorflow.org/install/pip), and [Flax](https://flax.readthedocs.io/en/latest/). It has been tested on Python 3.9+, PyTorch 2.0+, TensorFlow 2.6+, and Flax 0.4.1+.
+Transformers works with [PyTorch](https://pytorch.org/get-started/locally/), [TensorFlow 2.0](https://www.tensorflow.org/install/pip), and [Flax](https://flax.readthedocs.io/en/latest/). It has been tested on Python 3.9+, PyTorch 2.1+, TensorFlow 2.6+, and Flax 0.4.1+.

 ## Virtual environment

--- a/docs/source/en/internal/import_utils.md
+++ b/docs/source/en/internal/import_utils.md
@ -0,0 +1,104 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Import Utilities
+
+This page goes through the transformers utilities to enable lazy and fast object import.
+While we strive for minimal dependencies, some models have specific dependencies requirements that cannot be
+worked around. We don't want for all users of `transformers` to have to install those dependencies to use other models,
+we therefore mark those as soft dependencies rather than hard dependencies.
+
+The transformers toolkit is not made to error-out on import of a model that has a specific dependency; instead, an
+object for which you are lacking a dependency will error-out when calling any method on it. As an example, if 
+`torchvision` isn't installed, the fast image processors will not be available. 
+
+This object is still importable:
+
+```python
+>>> from transformers import DetrImageProcessorFast
+>>> print(DetrImageProcessorFast)
+<class 'DetrImageProcessorFast'>
+```
+
+However, no method can be called on that object:
+
+```python
+>>> DetrImageProcessorFast.from_pretrained()
+ImportError: 
+DetrImageProcessorFast requires the Torchvision library but it was not found in your environment. Checkout the instructions on the
+installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
+Please note that you may need to restart your runtime after installation.
+```
+
+Let's see how to specify specific object dependencies.
+
+## Specifying Object Dependencies
+
+### Filename-based
+
+All objects under a given filename have an automatic dependency to the tool linked to the filename
+
+**TensorFlow**: All files starting with `modeling_tf_` have an automatic TensorFlow dependency.
+
+**Flax**: All files starting with `modeling_flax_` have an automatic Flax dependency
+
+**PyTorch**: All files starting with `modeling_` and not valid with the above (TensorFlow and Flax) have an automatic 
+PyTorch dependency
+
+**Tokenizers**: All files starting with `tokenization_` and ending with `_fast` have an automatic `tokenizers` dependency
+
+**Vision**: All files starting with `image_processing_` have an automatic dependency to the `vision` dependency group; 
+at the time of writing, this only contains the `pillow` dependency.
+
+**Vision + Torch + Torchvision**: All files starting with `image_processing_` and ending with `_fast` have an automatic
+dependency to `vision`, `torch`, and `torchvision`.
+
+All of these automatic dependencies are added on top of the explicit dependencies that are detailed below.
+
+### Explicit Object Dependencies
+
+We add a method called `requires` that is used to explicitly specify the dependencies of a given object. As an
+example, the `Trainer` class has two hard dependencies: `torch` and `accelerate`. Here is how we specify these 
+required dependencies:
+
+```python
+from .utils.import_utils import requires
+
+@requires(backends=("torch", "accelerate"))
+class Trainer:
+    ...
+```
+
+Backends that can be added here are all the backends that are available in the `import_utils.py` module.
+
+Additionally, specific versions can be specified in each backend. For example, this is how you would specify
+a requirement on torch>=2.6 on the `Trainer` class:
+
+```python
+from .utils.import_utils import requires
+
+@requires(backends=("torch>=2.6", "accelerate"))
+class Trainer:
+    ...
+```
+
+You can specify the following operators: `==`, `>`, `>=`, `<`, `<=`, `!=`.
+
+## Methods
+
+[[autodoc]] utils.import_utils.define_import_structure
+
+[[autodoc]] utils.import_utils.requires
--- a/docs/source/en/internal/model_debugging_utils.md
+++ b/docs/source/en/internal/model_debugging_utils.md
@ -28,7 +28,7 @@ Most of those are only useful if you are adding new models in the library.

 This context manager is a power user tool intended for model adders.
 It tracks all forward calls within a model forward and logs a slice of each input and output on a nested Json.
-To note, this context manager enforces `torch.inference_mode()`.
+To note, this context manager enforces `torch.no_grad()`.

 ### Rationale

@ -43,6 +43,7 @@ import torch
 from PIL import Image
 import requests
 from transformers import LlavaProcessor, LlavaForConditionalGeneration
+from transformers.model_debugging_utils import model_addition_debugger_context
 torch.random.manual_seed(673)

 # load pretrained model and processor
@ -60,12 +61,153 @@ prompt = "<image>Describe this image."
 inputs = processor(text=prompt, images=random_image, return_tensors="pt")

 # call forward method (not .generate!)
-with model_addition_debugger_context(model, "optional_path_to_your_output_file.json"):
+with model_addition_debugger_context(
+  model,
+  debug_path="optional_path_to_your_directory",
+  do_prune_layers=False # This will output ALL the layers of a model.
+  ):
    output = model.forward(**inputs)

 ```


-[[autodoc]] model_addition_debugger
+### Reading results
+
+The debugger generates two files from the forward call, both with the same base name, 
+but ending either with `_SUMMARY.json` or with `_FULL_TENSORS.json`. 
+
+The first one will contain a summary of each module's _input_ and _output_ tensor values and shapes.
+
+```json
+{
+  "module_path": "MolmoForConditionalGeneration",
+  "inputs": {
+    "args": [],
+    "kwargs": {
+      "input_ids": {
+        "shape": "torch.Size([1, 589])",
+        "dtype": "torch.int64"
+      },
+      "attention_mask": {
+        "shape": "torch.Size([1, 589])",
+        "dtype": "torch.int64"
+      },
+      "pixel_values": {
+        "shape": "torch.Size([1, 5, 576, 588])",
+        "dtype": "torch.float32",
+        "mean": "tensor(-8.9514e-01, device='cuda:0')",
+        "std": "tensor(9.2586e-01, device='cuda:0')",
+        "min": "tensor(-1.7923e+00, device='cuda:0')",
+        "max": "tensor(1.8899e+00, device='cuda:0')"
+    }
+  },
+  "children": [
+    {
+      "module_path": "MolmoForConditionalGeneration.language_model.model.embed_tokens",
+      "inputs": {
+        "args": [
+          {
+            "shape": "torch.Size([1, 589])",
+            "dtype": "torch.int64"
+          }
+        ]
+      },
+      "outputs": {
+        "shape": "torch.Size([1, 589, 3584])",
+        "dtype": "torch.float32",
+        "mean": "tensor(6.5460e-06, device='cuda:0')",
+        "std": "tensor(2.3807e-02, device='cuda:0')",
+        "min": "tensor(-3.3398e-01, device='cuda:0')",
+        "max": "tensor(3.9453e-01, device='cuda:0')"
+      }
+    },
+    {
+      "module_path": "MolmoForConditionalGeneration.vision_tower",
+      "inputs": {
+        "args": [
+          {
+            "shape": "torch.Size([5, 1, 576, 588])",
+            "dtype": "torch.float32",
+            "mean": "tensor(-8.9514e-01, device='cuda:0')",
+            "std": "tensor(9.2586e-01, device='cuda:0')",
+            "min": "tensor(-1.7923e+00, device='cuda:0')",
+            "max": "tensor(1.8899e+00, device='cuda:0')"
+          }
+        ],
+        "kwargs": {
+          "output_hidden_states": "True"
+        }
+      },
+      "children": [
+        { ... and so on
+```
+
+The `_FULL_TENSORS.json` file will display a full view of all tensors, which is useful
+for comparing two files. 
+```json
+      "pixel_values": {
+        "shape": "torch.Size([1, 5, 576, 588])",
+        "dtype": "torch.float32",
+        "value": [
+          "tensor([[[[-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          ...,",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00]],",
+          "",
+          "         [[-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          ...,",
+          "          [-1.4857e+00, -1.4820e+00, -1.2100e+00,  ..., -6.0979e-01, -5.9650e-01, -3.8527e-01],",
+          "          [-1.6755e+00, -1.7221e+00, -1.4518e+00,  ..., -7.5577e-01, -7.4658e-01, -5.5592e-01],",
+          "          [-7.9957e-01, -8.2162e-01, -5.7014e-01,  ..., -1.3689e+00, -1.3169e+00, -1.0678e+00]],",
+          "",
+          "         [[-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          ...,",
+          "          [-3.0322e-01, -5.0645e-01, -5.8436e-01,  ..., -6.2439e-01, -7.9160e-01, -8.1188e-01],",
+          "          [-4.4921e-01, -6.5653e-01, -7.2656e-01,  ..., -3.4702e-01, -5.2146e-01, -5.1326e-01],",
+          "          [-3.4702e-01, -5.3647e-01, -5.4170e-01,  ..., -1.0915e+00, -1.1968e+00, -1.0252e+00]],",
+          "",
+          "         [[-1.1207e+00, -1.2718e+00, -1.0678e+00,  ..., 1.2013e-01, -1.3126e-01, -1.7197e-01],",
+          "          [-6.9738e-01, -9.1166e-01, -8.5454e-01,  ..., -5.5050e-02, -2.8134e-01, -4.2793e-01],",
+          "          [-3.4702e-01, -5.5148e-01, -5.8436e-01,  ..., 1.9312e-01, -8.6235e-02, -2.1463e-01],",
+          "          ...,",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00]],",
+          "",
+          "         [[-1.0039e+00, -9.5669e-01, -6.5546e-01,  ..., -1.4711e+00, -1.4219e+00, -1.1389e+00],",
+          "          [-1.0039e+00, -9.5669e-01, -6.5546e-01,  ..., -1.7193e+00, -1.6771e+00, -1.4091e+00],",
+          "          [-1.6317e+00, -1.6020e+00, -1.2669e+00,  ..., -1.2667e+00, -1.2268e+00, -8.9720e-01],",
+          "          ...,",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
+          "          [-1.7923e+00, -1.7521e+00, -1.4802e+00,  ..., -1.7923e+00, -1.7521e+00, -1.4802e+00]]]], device='cuda:0')"
+        ],
+        "mean": "tensor(-8.9514e-01, device='cuda:0')",
+        "std": "tensor(9.2586e-01, device='cuda:0')",
+        "min": "tensor(-1.7923e+00, device='cuda:0')",
+        "max": "tensor(1.8899e+00, device='cuda:0')"
+      },
+```
+
+### Comparing between implementations
+
+Once the forward passes of two models have been traced by the debugger, one can compare the `json` output files. See below: we can see slight differences between these two implementations' key projection layer. Inputs are mostly identical, but not quite. Looking through the file differences makes it easier to pinpoint which layer is wrong. 
+
+
+![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/files_difference_debugging.png)
+
+
+### Limitations and scope
+
+This feature will only work for torch-based models, and would require more work and case-by-case approach for say `jax`-based models that are usually compiled. Models relying heavily on external kernel calls may work, but trace will probably miss some things. Regardless, any python implementation that aims at mimicking another implementation can be traced once instead of reran N times with breakpoints.
+
+If you pass `do_prune_layers=False` to your model debugger, ALL the layers will be outputted to `json`. Else, only the first and last layer will be shown. This is useful when some layers (typically cross-attention) appear only after N layers. 

 [[autodoc]] model_addition_debugger_context
--- a/docs/source/en/internal/modeling_utils.md
+++ b/docs/source/en/internal/modeling_utils.md
@ -20,6 +20,10 @@ This page lists all the custom layers used by the library, as well as the utilit

 Most of those are only useful if you are studying the code of the models in the library.

+## Layers
+
+[[autodoc]] GradientCheckpointingLayer
+
 ## Attention Functions

 [[autodoc]] AttentionInterface
@ -33,23 +37,6 @@ Most of those are only useful if you are studying the code of the models in the

 [[autodoc]] pytorch_utils.Conv1D

-[[autodoc]] modeling_utils.PoolerStartLogits
-    - forward
-
-[[autodoc]] modeling_utils.PoolerEndLogits
-    - forward
-
-[[autodoc]] modeling_utils.PoolerAnswerClass
-    - forward
-
-[[autodoc]] modeling_utils.SquadHeadOutput
-
-[[autodoc]] modeling_utils.SQuADHead
-    - forward
-
-[[autodoc]] modeling_utils.SequenceSummary
-    - forward
-
 ## PyTorch Helper Functions

 [[autodoc]] pytorch_utils.apply_chunking_to_forward
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 The key-value (KV) vectors are used to calculate attention scores. For autoregressive models, KV scores are calculated *every* time because the model predicts one token at a time. Each prediction depends on the previous tokens, which means the model performs the same computations each time.

-A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. Refer to the [Caching](./cache_explanation.md) doc for a more detailed explanation about how a cache works.
+A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. Refer to the [Caching](./cache_explanation) doc for a more detailed explanation about how a cache works.

 Transformers offers several [`Cache`] classes that implement different caching mechanisms. Some of these [`Cache`] classes are optimized to save memory while others are designed to maximize generation speed. Refer to the table below to compare cache types and use it to help you select the best cache for your use case.

--- a/docs/source/en/llm_tutorial.md
+++ b/docs/source/en/llm_tutorial.md
@ -20,9 +20,13 @@ rendered properly in your Markdown viewer.

 Text generation is the most popular application for large language models (LLMs). A LLM is trained to generate the next word (token) given some initial text (prompt) along with its own generated outputs up to a predefined length or when it reaches an end-of-sequence (`EOS`) token.

-In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities.
+In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities. This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.

-This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.
+> [!TIP]
+> You can also chat with a model directly from the command line. ([reference](./conversations.md#transformers-cli))
+> ```shell
+> transformers chat Qwen/Qwen2.5-0.5B-Instruct
+> ```

 ## Default generate

@ -134,6 +138,20 @@ outputs = model.generate(**inputs, generation_config=generation_config)
 print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 ```

+## Common Options
+
+[`~GenerationMixin.generate`] is a powerful tool that can be heavily customized. This can be daunting for a new users. This section contains a list of popular generation options that you can define in most text generation tools in Transformers: [`~GenerationMixin.generate`], [`GenerationConfig`], `pipelines`, the `chat` CLI, ...
+
+| Option name | Type | Simplified description |
+|---|---|---|
+| `max_new_tokens` | `int` | Controls the maximum generation length. Be sure to define it, as it usually defaults to a small value. |
+| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies.md) for more information. |
+| `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
+| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
+| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
+| `eos_token_id` | `List[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
+
+
 ## Pitfalls

 The section below covers some common issues you may encounter during text generation and how to solve them.
@ -286,4 +304,4 @@ Take a look below for some more specific and specialized text generation librari
 - [SynCode](https://github.com/uiuc-focal-lab/syncode): a library for context-free grammar guided generation (JSON, SQL, Python).
 - [Text Generation Inference](https://github.com/huggingface/text-generation-inference): a production-ready server for LLMs.
 - [Text generation web UI](https://github.com/oobabooga/text-generation-webui): a Gradio web UI for text generation.
- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.
+- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.
--- a/docs/source/en/main_classes/agent.md
+++ b/docs/source/en/main_classes/agent.md
@ -1,167 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Agents & Tools
-
-<Tip warning={true}>
-
-Transformers Agents is an experimental API which is subject to change at any time. Results returned by the agents
-can vary as the APIs or underlying models are prone to change.
-
-</Tip>
-
-To learn more about agents and tools make sure to read the [introductory guide](../transformers_agents). This page
-contains the API docs for the underlying classes.
-
-## Agents
-
-We provide two types of agents, based on the main [`Agent`] class:
- [`CodeAgent`] acts in one shot, generating code to solve the task, then executes it at once.
- [`ReactAgent`] acts step by step, each step consisting of one thought, then one tool call and execution. It has two classes:
-  - [`ReactJsonAgent`] writes its tool calls in JSON.
-  - [`ReactCodeAgent`] writes its tool calls in Python code.
-
-### Agent
-
-[[autodoc]] Agent
-
-### CodeAgent
-
-[[autodoc]] CodeAgent
-
-### React agents
-
-[[autodoc]] ReactAgent
-
-[[autodoc]] ReactJsonAgent
-
-[[autodoc]] ReactCodeAgent
-
-### ManagedAgent
-
-[[autodoc]] ManagedAgent
-
-## Tools
-
-### load_tool
-
-[[autodoc]] load_tool
-
-### tool
-
-[[autodoc]] tool
-
-### Tool
-
-[[autodoc]] Tool
-
-### Toolbox
-
-[[autodoc]] Toolbox
-
-### PipelineTool
-
-[[autodoc]] PipelineTool
-
-### launch_gradio_demo
-
-[[autodoc]] launch_gradio_demo
-
-### stream_to_gradio
-
-[[autodoc]] stream_to_gradio
-
-### ToolCollection
-
-[[autodoc]] ToolCollection
-
-## Engines
-
-You're free to create and use your own engines to be usable by the Agents framework.
-These engines have the following specification:
-1. Follow the [messages format](../chat_templating.md) for its input (`List[Dict[str, str]]`) and return a string.
-2. Stop generating outputs *before* the sequences passed in the argument `stop_sequences`
-
-### TransformersEngine
-
-For convenience, we have added a `TransformersEngine` that implements the points above, taking a pre-initialized `Pipeline` as input.
-
-```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TransformersEngine
-
->>> model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
->>> tokenizer = AutoTokenizer.from_pretrained(model_name)
->>> model = AutoModelForCausalLM.from_pretrained(model_name)
-
->>> pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-
->>> engine = TransformersEngine(pipe)
->>> engine([{"role": "user", "content": "Ok!"}], stop_sequences=["great"])
-
-"What a "
-```
-
-[[autodoc]] TransformersEngine
-
-### HfApiEngine
-
-The `HfApiEngine` is an engine that wraps an [HF Inference API](https://huggingface.co/docs/api-inference/index) client for the execution of the LLM.
-
-```python
->>> from transformers import HfApiEngine
-
->>> messages = [
-...   {"role": "user", "content": "Hello, how are you?"},
-...   {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
-...   {"role": "user", "content": "No need to help, take it easy."},
-... ]
-
->>> HfApiEngine()(messages, stop_sequences=["conversation"])
-
-"That's very kind of you to say! It's always nice to have a relaxed "
-```
-
-[[autodoc]] HfApiEngine
-
-
-## Agent Types
-
-Agents can handle any type of object in-between tools; tools, being completely multimodal, can accept and return
-text, image, audio, video, among other types. In order to increase compatibility between tools, as well as to 
-correctly render these returns in ipython (jupyter, colab, ipython notebooks, ...), we implement wrapper classes
-around these types.
-
-The wrapped objects should continue behaving as initially; a text object should still behave as a string, an image
-object should still behave as a `PIL.Image`.
-
-These types have three specific purposes:
-
- Calling `to_raw` on the type should return the underlying object
- Calling `to_string` on the type should return the object as a string: that can be the string in case of an `AgentText`
-  but will be the path of the serialized version of the object in other instances
- Displaying it in an ipython kernel should display the object correctly
-
-### AgentText
-
-[[autodoc]] transformers.agents.agent_types.AgentText
-
-### AgentImage
-
-[[autodoc]] transformers.agents.agent_types.AgentImage
-
-### AgentAudio
-
-[[autodoc]] transformers.agents.agent_types.AgentAudio
--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@ -77,9 +77,9 @@ Learn how to quantize models in the [Quantization](../quantization) guide.

 [[autodoc]] TorchAoConfig

-## BitNetConfig
+## BitNetQuantConfig

-[[autodoc]] BitNetConfig
+[[autodoc]] BitNetQuantConfig

 ## SpQRConfig

@ -92,3 +92,7 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
 ## QuarkConfig

 [[autodoc]] QuarkConfig
+
+## AutoRoundConfig
+
+[[autodoc]] AutoRoundConfig
--- a/docs/source/en/main_classes/video_processor.md
+++ b/docs/source/en/main_classes/video_processor.md
@ -0,0 +1,55 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+# Video Processor
+
+A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch. 
+
+The video processor extends the functionality of image processors by allowing Vision Large Language Models (VLMs) to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM.
+
+When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't upadted your VLM, the processor will try to load video related configurations from a file named `preprocessing_config.json`.
+
+
+### Usage Example
+Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
+
+```python
+from transformers import AutoVideoProcessor
+
+processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
+```
+
+Currently, if using base image processor for videos, it processes video data by treating each frame as an individual image and applying transformations frame-by-frame. While functional, this approach is not highly efficient. Using `AutoVideoProcessor` allows us to take advantage of **fast video processors**, leveraging the [torchvision](https://pytorch.org/vision/stable/index.html) library. Fast processors handle the whole batch of videos at once, without iterating over each video or frame. These updates introduce GPU acceleration and significantly enhance processing speed, especially for tasks requiring high throughput.
+
+Fast video processors are available for all models and are loaded by default when an `AutoVideoProcessor` is initialized. When using a fast video processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. For even more speed improvement, we can compile the processor when using 'cuda' as device.
+
+```python
+import torch
+from transformers.video_utils import load_video
+from transformers import AutoVideoProcessor
+
+video = load_video("video.mp4")
+processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
+processor = torch.compile(processor)
+processed_video = processor(video, return_tensors="pt")
+```
+
+
+## BaseVideoProcessor
+
+[[autodoc]] video_processing_utils.BaseVideoProcessor
+
--- a/docs/source/en/model_doc/albert.md
+++ b/docs/source/en/model_doc/albert.md
@ -57,6 +57,7 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). This
 - Embedding size E is different from hidden size H justified because the embeddings are context independent (one embedding vector represents one token), whereas hidden states are context dependent (one hidden state represents a sequence of tokens) so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V being the vocab size). If E < H, it has less parameters.
 - Layers are split in groups that share parameters (to save memory).
 Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have been swapped or not.
+- The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`  

 ### Using Scaled Dot Product Attention (SDPA)

--- a/docs/source/en/model_doc/aria.md
+++ b/docs/source/en/model_doc/aria.md
@ -102,6 +102,10 @@ response = processor.decode(output_ids, skip_special_tokens=True)

 [[autodoc]] AriaTextModel

+## AriaModel
+
+[[autodoc]] AriaModel
+
 ## AriaTextForCausalLM

 [[autodoc]] AriaTextForCausalLM
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@ -74,6 +74,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its

 [[autodoc]] AutoImageProcessor

+## AutoVideoProcessor
+
+[[autodoc]] AutoVideoProcessor
+
 ## AutoProcessor

 [[autodoc]] AutoProcessor
--- a/docs/source/en/model_doc/aya_vision.md
+++ b/docs/source/en/model_doc/aya_vision.md
@ -237,6 +237,10 @@ for i, output in enumerate(batch_outputs):

 [[autodoc]] AyaVisionConfig

+## AyaVisionModel
+
+[[autodoc]] AyaVisionModel
+
 ## AyaVisionForConditionalGeneration

 [[autodoc]] AyaVisionForConditionalGeneration
--- a/docs/source/en/model_doc/bamba.md
+++ b/docs/source/en/model_doc/bamba.md
@ -39,7 +39,7 @@ Checkout all Bamba-9B model checkpoints [here](https://github.com/foundation-mod
 <!---
 ## Usage Tips

-Tips: 
+Tips:

 - The architecture is based on Mamba-2 models.

@ -63,7 +63,35 @@ response = model.generate(**inputs, max_new_tokens=64)
 print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
 ```

+
+## Padding-Free Training
+
+Bamba supports padding-free training in which distinct training examples can be concatenated
+together while nevertheless processing the inputs as though they belonged to separate batches. When
+the examples are of varying lengths, padding-free training can provide significant speed ups and
+memory savings compared to batching the examples together and using padding, as the unnecessary
+compute and memory due to padding is avoided entirely. The performance gains depend on factors such
+as the model and the data distribution, but throughput gains up to [~2x are commonly
+seen](https://github.com/huggingface/transformers/pull/35861#issue-2807873129).
+
+Using padding-free training with Bamba requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d`
+packages, and the following arguments must be passed to the model in addition to `input_ids` and
+`labels`:
+* `position_ids: torch.LongTensor`: the position index of each token in each sequence.
+* `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
+* Each of the [`FlashAttentionKwargs`]
+    * `cu_seq_lens_q: torch.LongTensor`: The cumulative sequence lengths of all queries.
+    * `cu_seq_lens_k: torch.LongTensor`: The cumulative sequence lengths of all keys.
+    * `max_length_q: int`: the longest query length in the batch.
+    * `max_length_k: int`: the longest key length in the batch.
+
+The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] can be used
+to programmatically generate the above set of additional arguments using `return_seq_idx=True` and
+`return_flash_attn_kwargs=True`. See [this blog post](https://huggingface.co/blog/packing-with-FA2)
+for additional information.
+
+
 [[autodoc]] BambaForCausalLM
    - forward

-This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim). 
+This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
--- a/docs/source/en/model_doc/bart.md
+++ b/docs/source/en/model_doc/bart.md
@ -55,6 +55,7 @@ This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The
  * mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token)
  * permute sentences
  * rotate the document to make it start at a specific token
+- The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`  

 ## Implementation Notes

--- a/docs/source/en/model_doc/beit.md
+++ b/docs/source/en/model_doc/beit.md
@ -151,6 +151,12 @@ If you're interested in submitting a resource to be included here, please feel f
    - preprocess
    - post_process_semantic_segmentation

+## BeitImageProcessorFast
+
+[[autodoc]] BeitImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+
 <frameworkcontent>
 <pt>

--- a/docs/source/en/model_doc/bert.md
+++ b/docs/source/en/model_doc/bert.md
@ -81,10 +81,10 @@ print(f"The predicted token is: {predicted_token}")
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
-echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google-bert/bert-base-uncased --device 0
+echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model google-bert/bert-base-uncased --device 0
 ```

 </hfoption>
@ -256,4 +256,4 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran

 [[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput

-[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
+[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
--- a/docs/source/en/model_doc/biogpt.md
+++ b/docs/source/en/model_doc/biogpt.md
@ -36,6 +36,7 @@ This model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The
 - BioGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left.
 - BioGPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next token in a sequence. Leveraging this feature allows BioGPT to generate syntactically coherent text as it can be observed in the run_generation.py example script.
 - The model can take the `past_key_values` (for PyTorch) as input, which is the previously computed key/value attention pairs. Using this (past_key_values or past) value prevents the model from re-computing pre-computed values in the context of text generation. For PyTorch, see past_key_values argument of the BioGptForCausalLM.forward() method for more information on its usage.
+- The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`  

 ### Using Scaled Dot Product Attention (SDPA)

--- a/docs/source/en/model_doc/bit.md
+++ b/docs/source/en/model_doc/bit.md
@ -58,6 +58,11 @@ If you're interested in submitting a resource to be included here, please feel f
 [[autodoc]] BitImageProcessor
    - preprocess

+## BitImageProcessorFast
+
+[[autodoc]] BitImageProcessorFast
+    - preprocess
+
 ## BitModel

 [[autodoc]] BitModel
--- a/docs/source/en/model_doc/bitnet.md
+++ b/docs/source/en/model_doc/bitnet.md
@ -0,0 +1,121 @@
+<!--Copyright 2025 The BitNet Team and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BitNet
+
+## Overview
+
+Trained on a corpus of 4 trillion tokens, this model demonstrates that native 1-bit LLMs can achieve performance comparable to leading open-weight, full-precision models of similar size, while offering substantial advantages in computational efficiency (memory, energy, latency).
+
+➡️ **Technical Report:** [BitNet b1.58 2B4T Technical Report](https://arxiv.org/abs/2504.12285)
+
+➡️ **Official Inference Code:** [microsoft/BitNet (bitnet.cpp)](https://github.com/microsoft/BitNet)
+
+## Model Variants
+
+Several versions of the model weights are available on Hugging Face:
+
+* [**`microsoft/bitnet-b1.58-2B-4T`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T): Contains the packed 1.58-bit weights optimized for efficient inference. **Use this for deployment.**
+
+* [**`microsoft/bitnet-b1.58-2B-4T-bf16`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-bf16): Contains the master weights in BF16 format. **Use this only for training or fine-tuning purposes.**
+
+* [**`microsoft/bitnet-b1.58-2B-4T-gguf`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-gguf): Contains the model weights in GGUF format, compatible with the `bitnet.cpp` library for CPU inference.
+
+
+### Model Details
+
+
+* **Architecture:** Transformer-based, modified with `BitLinear` layers (BitNet framework).
+    * Uses Rotary Position Embeddings (RoPE).
+    * Uses squared ReLU (ReLU²) activation in FFN layers.
+    * Employs [`subln`](https://proceedings.mlr.press/v202/wang23u.html) normalization.
+    * No bias terms in linear or normalization layers.
+* **Quantization:** Native 1.58-bit weights and 8-bit activations (W1.58A8).
+    * Weights are quantized to ternary values {-1, 0, +1} using absmean quantization during the forward pass.
+    * Activations are quantized to 8-bit integers using absmax quantization (per-token).
+    * **Crucially, the model was *trained from scratch* with this quantization scheme, not post-training quantized.**
+* **Parameters:** ~2 Billion
+* **Training Tokens:** 4 Trillion
+*   **Context Length:** Maximum sequence length of **4096 tokens**.
+    *   *Recommendation:* For optimal performance on tasks requiring very long contexts (beyond the pre-training length or for specialized long-reasoning tasks), we recommend performing intermediate long-sequence adaptation/training before the final fine-tuning stage.
+* **Training Stages:**
+    1.  **Pre-training:** Large-scale training on public text/code and synthetic math data using a two-stage learning rate and weight decay schedule.
+    2.  **Supervised Fine-tuning (SFT):** Fine-tuned on instruction-following and conversational datasets using sum loss aggregation and specific hyperparameter tuning.
+    3.  **Direct Preference Optimization (DPO):** Aligned with human preferences using preference pairs.
+* **Tokenizer:** LLaMA 3 Tokenizer (vocab size: 128,256).
+
+
+## Usage tips
+
+
+**VERY IMPORTANT NOTE ON EFFICIENCY**
+
+> Please do NOT expect performance efficiency gains (in terms of speed, latency, or energy consumption) when using this model with the standard transformers library.
+>
+> The current execution paths within transformers do not contain the specialized, highly optimized computational kernels required to leverage the advantages of the BitNet architecture. Running the model via transformers will likely result in inference speeds and energy usage comparable to, or potentially worse than, standard full-precision models within this framework on both CPU and GPU.
+>
+> While you might observe reduced memory usage due to the quantized weights, the primary computational efficiency benefits are not accessible through this standard transformers usage path.
+>
+> For achieving the efficiency benefits demonstrated in the technical paper, you MUST use the dedicated C++ implementation: [bitnet.cpp](https://github.com/microsoft/BitNet).
+
+### Requirements
+
+```bash
+pip install transformers
+```
+
+### Example
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "microsoft/bitnet-b1.58-2B-4T"
+
+# Load tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16
+)
+
+# Apply the chat template
+messages = [
+    {"role": "system", "content": "You are a helpful AI assistant."},
+    {"role": "user", "content": "How are you?"},
+]
+chat_input = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
+
+# Generate response
+chat_outputs = model.generate(chat_input, max_new_tokens=50)
+response = tokenizer.decode(chat_outputs[0][chat_input.shape[-1]:], skip_special_tokens=True) # Decode only the response part
+print("\nAssistant Response:", response)
+```
+
+
+## BitNetConfig
+
+[[autodoc]] BitNetConfig
+
+## BitNetModel
+
+[[autodoc]] BitNetModel
+    - forward
+
+## BitNetForCausalLM
+
+[[autodoc]] BitNetForCausalLM
+    - forward
--- a/docs/source/en/model_doc/blip.md
+++ b/docs/source/en/model_doc/blip.md
@ -88,6 +88,11 @@ The original code can be found [here](https://github.com/salesforce/BLIP).
 [[autodoc]] BlipTextModel
    - forward

+## BlipTextLMHeadModel
+
+[[autodoc]] BlipTextLMHeadModel
+- forward
+
 ## BlipVisionModel

 [[autodoc]] BlipVisionModel
@ -123,6 +128,11 @@ The original code can be found [here](https://github.com/salesforce/BLIP).
 [[autodoc]] TFBlipTextModel
    - call

+## TFBlipTextLMHeadModel
+
+[[autodoc]] TFBlipTextLMHeadModel
+- forward
+
 ## TFBlipVisionModel

 [[autodoc]] TFBlipVisionModel
--- a/docs/source/en/model_doc/bridgetower.md
+++ b/docs/source/en/model_doc/bridgetower.md
@ -147,6 +147,11 @@ Tips:
 [[autodoc]] BridgeTowerImageProcessor
    - preprocess

+## BridgeTowerImageProcessorFast
+
+[[autodoc]] BridgeTowerImageProcessorFast
+    - preprocess
+
 ## BridgeTowerProcessor

 [[autodoc]] BridgeTowerProcessor
--- a/docs/source/en/model_doc/chinese_clip.md
+++ b/docs/source/en/model_doc/chinese_clip.md
@ -90,6 +90,11 @@ Currently, following scales of pretrained Chinese-CLIP models are available on
 [[autodoc]] ChineseCLIPImageProcessor
    - preprocess

+## ChineseCLIPImageProcessorFast
+
+[[autodoc]] ChineseCLIPImageProcessorFast
+    - preprocess
+
 ## ChineseCLIPFeatureExtractor

 [[autodoc]] ChineseCLIPFeatureExtractor
--- a/docs/source/en/model_doc/code_llama.md
+++ b/docs/source/en/model_doc/code_llama.md
@ -35,7 +35,7 @@ The example below demonstrates how to generate code with [`Pipeline`], or the [`

 <hfoptions id="usage">
 <hfoption id="Pipeline">
-    
+
 ```py
 import torch
 from transformers import pipeline
@ -76,7 +76,7 @@ prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
 input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

 output = model.generate(
-    **input_ids, 
+    **input_ids,
    max_new_tokens=256,
    cache_implementation="static"
 )
@ -92,10 +92,10 @@ print(filled_text)
 ```

 </hfoption>
-<hfoption id="transformers-cli">
-    
+<hfoption id="transformers CLI">
+
 ```bash
-echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers-cli run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
+echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
 ```

 </hfoption>
@ -146,7 +146,7 @@ visualizer("""def func(a, b):
 - Use the `<FILL_ME>` token where you want your input to be filled. The tokenizer splits this token to create a formatted input string that follows the [original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself.
    ```py
    from transformers import LlamaForCausalLM, CodeLlamaTokenizer
-    
+
    tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
    model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
    PROMPT = '''def remove_non_ascii(s: str) -> str:
@ -155,7 +155,7 @@ visualizer("""def func(a, b):
    '''
    input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
    generated_ids = model.generate(input_ids, max_new_tokens=128)
-    
+
    filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
    print(PROMPT.replace("<FILL_ME>", filling))
    ```
--- a/docs/source/en/model_doc/cohere.md
+++ b/docs/source/en/model_doc/cohere.md
@ -49,9 +49,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
 messages = [{"role": "user", "content": "How do plants make energy?"}]
 input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 output = model.generate(
-    input_ids, 
-    max_new_tokens=100, 
-    do_sample=True, 
+    input_ids,
+    max_new_tokens=100,
+    do_sample=True,
    temperature=0.3,
    cache_implementation="static",
 )
@ -59,11 +59,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
 # pip install -U flash-attn --no-build-isolation
-transformers-cli chat --model_name_or_path CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
+transformers chat CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
 ```

 </hfoption>
@ -85,9 +85,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
 messages = [{"role": "user", "content": "How do plants make energy?"}]
 input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 output = model.generate(
-    input_ids, 
-    max_new_tokens=100, 
-    do_sample=True, 
+    input_ids,
+    max_new_tokens=100,
+    do_sample=True,
    temperature=0.3,
    cache_implementation="static",
 )
--- a/docs/source/en/model_doc/colpali.md
+++ b/docs/source/en/model_doc/colpali.md
@ -1,5 +1,4 @@
 <!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

@ -9,76 +8,134 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.

-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.
-
 -->

+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
 # ColPali

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
+[ColPali](https://huggingface.co/papers/2407.01449) is a model designed to retrieve documents by analyzing their visual features. Unlike traditional systems that rely heavily on text extraction and OCR, ColPali treats each page as an image. It uses [Paligemma-3B](./paligemma) to capture not only text, but also the layout, tables, charts, and other visual elements to create detailed embeddings. This offers a more comprehensive understanding of documents and enables more efficient and accurate retrieval.

-## Overview
+You can find all the original ColPali checkpoints under the [ColPali](https://huggingface.co/collections/vidore/hf-native-colvision-models-6755d68fc60a8553acaa96f7) collection.

-The *ColPali* model was proposed in [ColPali: Efficient Document Retrieval with Vision Language Models](https://doi.org/10.48550/arXiv.2407.01449) by **Manuel Faysse***, **Hugues Sibille***, **Tony Wu***, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo (* denotes equal contribution). Work lead by ILLUIN Technology.
+> [!TIP]
+> Click on the ColPali models in the right sidebar for more examples of how to use ColPali for image retrieval.

-In our proposed *ColPali* approach, we leverage VLMs to construct efficient multi-vector embeddings directly from document images (“screenshots”) for document retrieval. We train the model to maximize the similarity between these document embeddings and the corresponding query embeddings, using the late interaction method introduced in ColBERT.
+<hfoptions id="usage">
+<hfoption id="image retrieval">

-Using *ColPali* removes the need for potentially complex and brittle layout recognition and OCR pipelines with a single model that can take into account both the textual and visual content (layout, charts, etc.) of a document.
-
-## Resources
-
- The *ColPali* arXiv paper can be found [here](https://doi.org/10.48550/arXiv.2407.01449). 📄
- The official blog post detailing ColPali can be found [here](https://huggingface.co/blog/manu/colpali). 📝
- The original model implementation code for the ColPali model and for the `colpali-engine` package can be found [here](https://github.com/illuin-tech/colpali). 🌎
- Cookbooks for learning to use the transformers-native version of *ColPali*, fine-tuning, and similarity maps generation can be found [here](https://github.com/tonywu71/colpali-cookbooks). 📚
-
-This model was contributed by [@tonywu71](https://huggingface.co/tonywu71) and [@yonigozlan](https://huggingface.co/yonigozlan).
-
-## Usage
-
-This example demonstrates how to use *ColPali* to embed both queries and images, calculate their similarity scores, and identify the most relevant matches. For a specific query, you can retrieve the top-k most similar images by selecting the ones with the highest similarity scores.
-
-```python
+```py
+import requests
 import torch
 from PIL import Image
-
 from transformers import ColPaliForRetrieval, ColPaliProcessor

-model_name = "vidore/colpali-v1.2-hf"
-
+# Load model (bfloat16 support is limited; fallback to float32 if needed)
 model = ColPaliForRetrieval.from_pretrained(
-    model_name,
-    torch_dtype=torch.bfloat16,
-    device_map="cuda:0",  # or "mps" if on Apple Silicon
+    "vidore/colpali-v1.2-hf",
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto",  # "cpu", "cuda", or "mps" for Apple Silicon
 ).eval()

 processor = ColPaliProcessor.from_pretrained(model_name)

-# Your inputs (replace dummy images with screenshots of your documents)
+url1 = "https://upload.wikimedia.org/wikipedia/commons/8/89/US-original-Declaration-1776.jpg"
+url2 = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Romeoandjuliet1597.jpg/500px-Romeoandjuliet1597.jpg"
+
 images = [
-    Image.new("RGB", (32, 32), color="white"),
-    Image.new("RGB", (16, 16), color="black"),
+    Image.open(requests.get(url1, stream=True).raw),
+    Image.open(requests.get(url2, stream=True).raw),
 ]
+
 queries = [
-    "What is the organizational structure for our R&D department?",
-    "Can you provide a breakdown of last year’s financial performance?",
+    "Who printed the edition of Romeo and Juliet?",
+    "When was the United States Declaration of Independence proclaimed?",
 ]

 # Process the inputs
-batch_images = processor(images=images).to(model.device)
-batch_queries = processor(text=queries).to(model.device)
+inputs_images = processor(images=images, return_tensors="pt").to(model.device)
+inputs_text = processor(text=queries, return_tensors="pt").to(model.device)

 # Forward pass
 with torch.no_grad():
-    image_embeddings = model(**batch_images).embeddings
-    query_embeddings = model(**batch_queries).embeddings
+    image_embeddings = model(**inputs_images).embeddings
+    query_embeddings = model(**inputs_text).embeddings

-# Score the queries against the images
 scores = processor.score_retrieval(query_embeddings, image_embeddings)
+
+print("Retrieval scores (query x image):")
+print(scores)
 ```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to int4.
+
+```py
+import requests
+import torch
+from PIL import Image
+from transformers import ColPaliForRetrieval, ColPaliProcessor
+from transformers import BitsAndBytesConfig
+
+# 4-bit quantization configuration
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+)
+
+model_name = "vidore/colpali-v1.2-hf"
+
+# Load model 
+model = ColPaliForRetrieval.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map="cuda"
+).eval()
+
+processor = ColPaliProcessor.from_pretrained(model_name)
+
+url1 = "https://upload.wikimedia.org/wikipedia/commons/8/89/US-original-Declaration-1776.jpg"
+url2 = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Romeoandjuliet1597.jpg/500px-Romeoandjuliet1597.jpg"
+
+images = [
+    Image.open(requests.get(url1, stream=True).raw),
+    Image.open(requests.get(url2, stream=True).raw),
+]
+
+queries = [
+    "Who printed the edition of Romeo and Juliet?",
+    "When was the United States Declaration of Independence proclaimed?",
+]
+
+# Process the inputs
+inputs_images = processor(images=images, return_tensors="pt").to(model.device)
+inputs_text = processor(text=queries, return_tensors="pt").to(model.device)
+
+# Forward pass
+with torch.no_grad():
+    image_embeddings = model(**inputs_images).embeddings
+    query_embeddings = model(**inputs_text).embeddings
+
+scores = processor.score_retrieval(query_embeddings, image_embeddings)
+
+print("Retrieval scores (query x image):")
+print(scores)
+```
+
+## Notes
+
+- [`~ColPaliProcessor.score_retrieval`] returns a 2D tensor where the first dimension is the number of queries and the second dimension is the number of images. A higher score indicates more similarity between the query and image.

 ## ColPaliConfig

--- a/docs/source/en/model_doc/conditional_detr.md
+++ b/docs/source/en/model_doc/conditional_detr.md
@ -48,6 +48,11 @@ This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The o

 [[autodoc]] ConditionalDetrImageProcessor
    - preprocess
+
+## ConditionalDetrImageProcessorFast
+
+[[autodoc]] ConditionalDetrImageProcessorFast
+    - preprocess
    - post_process_object_detection
    - post_process_instance_segmentation
    - post_process_semantic_segmentation
--- a/docs/source/en/model_doc/csm.md
+++ b/docs/source/en/model_doc/csm.md
@ -0,0 +1,377 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Csm
+
+## Overview
+
+The Conversational Speech Model (CSM) is the first open-source contextual text-to-speech model [released by Sesame](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice). It is designed to generate natural-sounding speech with or without conversational context. This context typically consists of multi-turn dialogue between speakers, represented as sequences of text and corresponding spoken audio.
+
+**Model Architecture:**
+CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi.md), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.
+
+The original csm-1b checkpoint is available under the [Sesame](https://huggingface.co/sesame/csm-1b) organization on Hugging Face.
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/csm_architecture.png"/>
+</div>
+
+## Usage Tips
+
+### Without Conversational Context
+
+CSM can be used to simply generate speech from a text prompt:
+
+```python
+import torch
+from transformers import CsmForConditionalGeneration, AutoProcessor
+
+model_id = "eustlb/csm-1b"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# load the model and the processor
+processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
+
+# prepare the inputs
+text = "[0]The past is just a story we tell ourselves." # `[0]` for speaker id 0
+inputs = processor(text, add_special_tokens=True).to(device)
+
+# another equivalent way to prepare the inputs
+conversation = [
+    {"role": "0", "content": [{"type": "text", "text": "The past is just a story we tell ourselves."}]},
+]
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+).to(device)
+
+# infer the model
+audio = model.generate(**inputs, output_audio=True)
+processor.save_audio(audio, "example_without_context.wav")
+```
+
+### With Conversational Context
+
+CSM can be used to generate speech given a conversation, allowing consistency in the voices and content-aware generation:
+
+```python
+import torch
+from transformers import CsmForConditionalGeneration, AutoProcessor
+from datasets import load_dataset, Audio
+
+model_id = "eustlb/csm-1b"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# load the model and the processor
+processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
+
+# prepare the inputs
+ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
+# ensure the audio is 24kHz
+ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+conversation = []
+
+# 1. context
+for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
+    conversation.append(
+        {
+            "role": f"{speaker_id}",
+            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
+        }
+    )
+
+# 2. text prompt
+conversation.append({"role": f"{ds[4]['speaker_id']}", "content": [{"type": "text", "text": ds[4]["text"]}]})
+
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+).to(device)
+
+# infer the model
+audio = model.generate(**inputs, output_audio=True)
+processor.save_audio(audio, "example_with_context.wav")
+```
+
+### Batched Inference
+
+CSM supports batched inference!
+
+```python
+import torch
+from transformers import CsmForConditionalGeneration, AutoProcessor
+from datasets import load_dataset, Audio
+
+model_id = "eustlb/csm-1b"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# load the model and the processor
+processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
+
+# prepare the inputs 
+ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
+# ensure the audio is 24kHz
+ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+# here a batch with two prompts
+conversation = [
+    [
+        {
+            "role": f"{ds[0]['speaker_id']}",
+            "content": [
+                {"type": "text", "text": ds[0]["text"]},
+                {"type": "audio", "path": ds[0]["audio"]["array"]},
+            ],
+        },
+        {
+            "role": f"{ds[1]['speaker_id']}",
+            "content": [
+                {"type": "text", "text": ds[1]["text"]},
+            ],
+        },
+    ],
+    [
+        {
+            "role": f"{ds[0]['speaker_id']}",
+            "content": [
+                {"type": "text", "text": ds[0]["text"]},
+            ],
+        }
+    ],
+]
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+).to(device)
+
+audio = model.generate(**inputs, output_audio=True)
+processor.save_audio(audio, [f"speech_batch_idx_{i}.wav" for i in range(len(audio))])
+```
+
+### Making The Model Go Brrr
+
+CSM supports full-graph compilation with CUDA graphs!
+
+```python
+import torch
+import copy
+from transformers import CsmForConditionalGeneration, AutoProcessor
+from datasets import load_dataset
+
+model_id = "eustlb/csm-1b"
+device = "cuda"
+
+# set logs to ensure no recompilation and graph breaks
+torch._logging.set_logs(graph_breaks=True, recompiles=True, cudagraphs=True)
+
+# load the model and the processor
+processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
+
+# use static cache, enabling automatically torch compile with fullgraph and reduce-overhead
+model.generation_config.max_length = 250 # big enough to avoid recompilation
+model.generation_config.max_new_tokens = None # would take precedence over max_length
+model.generation_config.cache_implementation = "static"
+model.depth_decoder.generation_config.cache_implementation = "static"
+
+# generation kwargs
+gen_kwargs = {
+    "do_sample": False,
+    "depth_decoder_do_sample": False,
+    "temperature": 1.0,
+    "depth_decoder_temperature": 1.0,
+}
+
+# Define a timing decorator
+class TimerContext:
+    def __init__(self, name="Execution"):
+        self.name = name
+        self.start_event = None
+        self.end_event = None
+        
+    def __enter__(self):
+        # Use CUDA events for more accurate GPU timing
+        self.start_event = torch.cuda.Event(enable_timing=True)
+        self.end_event = torch.cuda.Event(enable_timing=True)
+        self.start_event.record()
+        return self
+
+    def __exit__(self, *args):
+        self.end_event.record()
+        torch.cuda.synchronize()
+        elapsed_time = self.start_event.elapsed_time(self.end_event) / 1000.0
+        print(f"{self.name} time: {elapsed_time:.4f} seconds")
+
+# prepare the inputs 
+ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
+
+conversation = [
+    {
+        "role": f"{ds[0]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[0]["text"]},
+            {"type": "audio", "path": ds[0]["audio"]["array"]},
+        ],
+    },
+    {
+        "role": f"{ds[1]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[1]["text"]},
+            {"type": "audio", "path": ds[1]["audio"]["array"]},
+        ],
+    },
+    {
+        "role": f"{ds[2]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[2]["text"]},
+        ],
+    },
+]
+
+padded_inputs_1 = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+).to(device)
+
+print("\n" + "="*50)
+print("First generation - compiling and recording CUDA graphs...")
+with TimerContext("First generation"):
+    _ = model.generate(**padded_inputs_1, **gen_kwargs)
+print("="*50)
+
+print("\n" + "="*50)
+print("Second generation - fast !!!")
+with TimerContext("Second generation"):
+    _ = model.generate(**padded_inputs_1, **gen_kwargs)
+print("="*50)
+
+# now with different inputs
+conversation = [
+    {
+        "role": f"{ds[0]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[2]["text"]},
+            {"type": "audio", "path": ds[2]["audio"]["array"]},
+        ],
+    },
+    {
+        "role": f"{ds[1]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[3]["text"]},
+            {"type": "audio", "path": ds[3]["audio"]["array"]},
+        ],
+    },
+    {
+        "role": f"{ds[2]['speaker_id']}",
+        "content": [
+            {"type": "text", "text": ds[4]["text"]},
+        ],
+    },
+]
+padded_inputs_2 = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+).to(device)
+
+print("\n" + "="*50)
+print("Generation with other inputs!")
+with TimerContext("Generation with different inputs"):
+    _ = model.generate(**padded_inputs_2, **gen_kwargs)
+print("="*50)
+```
+
+### Training
+
+CSM Transformers integration supports training!
+
+```python
+from transformers import CsmForConditionalGeneration, AutoProcessor
+from datasets import load_dataset, Audio
+
+model_id = "eustlb/csm-1b"
+device = "cuda"
+
+# load the model and the processor
+processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
+model.train()
+
+ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
+# ensure the audio is 24kHz
+ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+conversation = []
+
+# context
+for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
+    conversation.append(
+        {
+            "role": f"{speaker_id}",
+            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
+        }
+    )
+
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    return_dict=True,
+    output_labels=True,
+).to(device)
+
+out = model(**inputs)
+out.loss.backward()
+```
+
+This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
+The original code can be found [here](https://github.com/SesameAILabs/csm).
+
+
+## CsmConfig
+
+[[autodoc]] CsmConfig
+
+## CsmDepthDecoderConfig
+
+[[autodoc]] CsmDepthDecoderConfig
+
+## CsmProcessor
+
+[[autodoc]] CsmProcessor
+    - __call__
+
+## CsmForConditionalGeneration
+
+[[autodoc]] CsmForConditionalGeneration
+    - forward
+    - generate
+
+## CsmDepthDecoderForCausalLM
+
+[[autodoc]] CsmDepthDecoderForCausalLM
+
+## CsmDepthDecoderModel
+
+[[autodoc]] CsmDepthDecoderModel
+
+## CsmBackboneModel
+
+[[autodoc]] CsmBackboneModel
--- a/docs/source/en/model_doc/d_fine.md
+++ b/docs/source/en/model_doc/d_fine.md
@ -0,0 +1,76 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# D-FINE
+
+## Overview
+
+The D-FINE model was proposed in [D-FINE: Redefine Regression Task in DETRs as Fine-grained Distribution Refinement](https://arxiv.org/abs/2410.13842) by
+Yansong Peng, Hebei Li, Peixi Wu, Yueyi Zhang, Xiaoyan Sun, Feng Wu
+
+The abstract from the paper is the following:
+
+*We introduce D-FINE, a powerful real-time object detector that achieves outstanding localization precision by redefining the bounding box regression task in DETR models. D-FINE comprises two key components: Fine-grained Distribution Refinement (FDR) and Global Optimal Localization Self-Distillation (GO-LSD). 
+FDR transforms the regression process from predicting fixed coordinates to iteratively refining probability distributions, providing a fine-grained intermediate representation that significantly enhances localization accuracy. GO-LSD is a bidirectional optimization strategy that transfers localization knowledge from refined distributions to shallower layers through self-distillation, while also simplifying the residual prediction tasks for deeper layers. Additionally, D-FINE incorporates lightweight optimizations in computationally intensive modules and operations, achieving a better balance between speed and accuracy. Specifically, D-FINE-L / X achieves 54.0% / 55.8% AP on the COCO dataset at 124 / 78 FPS on an NVIDIA T4 GPU. When pretrained on Objects365, D-FINE-L / X attains 57.1% / 59.3% AP, surpassing all existing real-time detectors. Furthermore, our method significantly enhances the performance of a wide range of DETR models by up to 5.3% AP with negligible extra parameters and training costs. Our code and pretrained models: this https URL.*
+
+This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber). 
+The original code can be found [here](https://github.com/Peterande/D-FINE).
+
+## Usage tips 
+
+```python
+>>> import torch
+>>> from transformers.image_utils import load_image
+>>> from transformers import DFineForObjectDetection, AutoImageProcessor
+
+>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> image = load_image(url)
+
+>>> image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine_x_coco")
+>>> model = DFineForObjectDetection.from_pretrained("ustc-community/dfine_x_coco")
+
+>>> inputs = image_processor(images=image, return_tensors="pt")
+
+>>> with torch.no_grad():
+...     outputs = model(**inputs)
+
+>>> results = image_processor.post_process_object_detection(outputs, target_sizes=[(image.height, image.width)], threshold=0.5)
+
+>>> for result in results:
+...     for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
+...         score, label = score.item(), label_id.item()
+...         box = [round(i, 2) for i in box.tolist()]
+...         print(f"{model.config.id2label[label]}: {score:.2f} {box}")
+cat: 0.96 [344.49, 23.4, 639.84, 374.27]
+cat: 0.96 [11.71, 53.52, 316.64, 472.33]
+remote: 0.95 [40.46, 73.7, 175.62, 117.57]
+sofa: 0.92 [0.59, 1.88, 640.25, 474.74]
+remote: 0.89 [333.48, 77.04, 370.77, 187.3]
+```
+
+## DFineConfig
+
+[[autodoc]] DFineConfig
+
+## DFineModel
+
+[[autodoc]] DFineModel
+    - forward
+
+## DFineForObjectDetection
+
+[[autodoc]] DFineForObjectDetection
+    - forward
--- a/docs/source/en/model_doc/data2vec.md
+++ b/docs/source/en/model_doc/data2vec.md
@ -53,6 +53,7 @@ The original code for vision can be found [here](https://github.com/facebookrese
 - For Data2VecAudio, preprocessing is identical to [`Wav2Vec2Model`], including feature extraction
 - For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization.
 - For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction.
+- The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`  

 ### Using Scaled Dot Product Attention (SDPA)

--- a/docs/source/en/model_doc/dinov2.md
+++ b/docs/source/en/model_doc/dinov2.md
@ -10,71 +10,169 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# DINOv2
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview

-The DINOv2 model was proposed in [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by
-Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
-DINOv2 is an upgrade of [DINO](https://arxiv.org/abs/2104.14294), a self-supervised method applied on [Vision Transformers](vit). This method enables all-purpose visual features, i.e., features that work across image distributions and tasks without finetuning.
+# DINOv2

-The abstract from the paper is the following:
+[DINOv2](https://huggingface.co/papers/2304.07193) is a vision foundation model that uses [ViT](./vit) as a feature extractor for multiple downstream tasks like image classification and depth estimation. It focuses on stabilizing and accelerating training through techniques like a faster memory-efficient attention, sequence packing, improved stochastic depth, Fully Sharded Data Parallel (FSDP), and model distillation.

-*The recent breakthroughs in natural language processing for model pretraining on large quantities of data have opened the way for similar foundation models in computer vision. These models could greatly simplify the use of images in any system by producing all-purpose visual features, i.e., features that work across image distributions and tasks without finetuning. This work shows that existing pretraining methods, especially self-supervised methods, can produce such features if trained on enough curated data from diverse sources. We revisit existing approaches and combine different techniques to scale our pretraining in terms of data and model size. Most of the technical contributions aim at accelerating and stabilizing the training at scale. In terms of data, we propose an automatic pipeline to build a dedicated, diverse, and curated image dataset instead of uncurated data, as typically done in the self-supervised literature. In terms of models, we train a ViT model (Dosovitskiy et al., 2020) with 1B parameters and distill it into a series of smaller models that surpass the best available all-purpose features, OpenCLIP (Ilharco et al., 2021) on most of the benchmarks at image and pixel levels.*
+You can find all the original DINOv2 checkpoints under the [Dinov2](https://huggingface.co/collections/facebook/dinov2-6526c98554b3d2576e071ce3) collection.

-This model was contributed by [nielsr](https://huggingface.co/nielsr).
-The original code can be found [here](https://github.com/facebookresearch/dinov2).
+> [!TIP]
+> Click on the DINOv2 models in the right sidebar for more examples of how to apply DINOv2 to different vision tasks.

-## Usage tips
+The example below demonstrates how to obtain an image embedding with [`Pipeline`] or the [`AutoModel`] class.

-The model can be traced using `torch.jit.trace` which leverages JIT compilation to optimize the model making it faster to run. Note this still produces some mis-matched elements and the difference between the original model and the traced model is of the order of 1e-4.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-```python
+```py
 import torch
-from transformers import AutoImageProcessor, AutoModel
-from PIL import Image
+from transformers import pipeline
+
+pipe = pipeline(
+    task="image-classification", 
+    model="facebook/dinov2-small-imagenet1k-1-layer",
+    torch_dtype=torch.float16,
+    device=0
+)
+
+pipe("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg")
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
 import requests
+from transformers import AutoImageProcessor, AutoModelForImageClassification
+from PIL import Image
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+processor = AutoImageProcessor.from_pretrained("facebook/dinov2-small-imagenet1k-1-layer")
+model = AutoModelForImageClassification.from_pretrained(
+    "facebook/dinov2-small-imagenet1k-1-layer", 
+    torch_dtype=torch.float16, 
+    device_map="auto", 
+    attn_implementation="sdpa"
+)
+
+inputs = processor(images=image, return_tensors="pt")
+logits = model(**inputs).logits
+predicted_class_idx = logits.argmax(-1).item()
+print("Predicted class:", model.config.id2label[predicted_class_idx])
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
+
+```py
+# pip install torchao
+import requests
+from transformers import TorchAoConfig, AutoImageProcessor, AutoModelForImageClassification
+from torchao.quantization import Int4WeightOnlyConfig
+from PIL import Image

 url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
 image = Image.open(requests.get(url, stream=True).raw)

-processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
-model = AutoModel.from_pretrained('facebook/dinov2-base')
+processor = AutoImageProcessor.from_pretrained('facebook/dinov2-giant-imagenet1k-1-layer')
+
+quant_config = Int4WeightOnlyConfig(group_size=128)
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+model = AutoModelForImageClassification.from_pretrained(
+    'facebook/dinov2-giant-imagenet1k-1-layer',
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config
+)

 inputs = processor(images=image, return_tensors="pt")
 outputs = model(**inputs)
-last_hidden_states = outputs[0]
-
-# We have to force return_dict=False for tracing
-model.config.return_dict = False
-
-with torch.no_grad():
-    traced_model = torch.jit.trace(model, [inputs.pixel_values])
-    traced_outputs = traced_model(inputs.pixel_values)
-
-print((last_hidden_states - traced_outputs[0]).abs().max())
+logits = outputs.logits
+predicted_class_idx = logits.argmax(-1).item()
+print("Predicted class:", model.config.id2label[predicted_class_idx])
 ```

-## Resources
+## Notes

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DINOv2.
+- The example below shows how to split the output tensor into:
+  - one embedding for the whole image, commonly referred to as a `CLS` token,
+    useful for classification and retrieval
+  - a set of local embeddings, one for each `14x14` patch of the input image,
+    useful for dense tasks, such as semantic segmentation

- Demo notebooks for DINOv2 can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DINOv2). 🌎
+  ```py
+  from transformers import AutoImageProcessor, AutoModel
+  from PIL import Image
+  import requests
+  
+  url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+  image = Image.open(requests.get(url, stream=True).raw)
+  print(image.height, image.width)  # [480, 640]
+  
+  processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
+  model = AutoModel.from_pretrained('facebook/dinov2-base')
+  patch_size = model.config.patch_size
+  
+  inputs = processor(images=image, return_tensors="pt")
+  print(inputs.pixel_values.shape)  # [1, 3, 224, 224]
+  batch_size, rgb, img_height, img_width = inputs.pixel_values.shape
+  num_patches_height, num_patches_width = img_height // patch_size, img_width // patch_size
+  num_patches_flat = num_patches_height * num_patches_width
+  
+  outputs = model(**inputs)
+  last_hidden_states = outputs[0]
+  print(last_hidden_states.shape)  # [1, 1 + 256, 768]
+  assert last_hidden_states.shape == (batch_size, 1 + num_patches_flat, model.config.hidden_size)
+  
+  cls_token = last_hidden_states[:, 0, :]
+  patch_features = last_hidden_states[:, 1:, :].unflatten(1, (num_patches_height, num_patches_width))
+  ```

-<PipelineTag pipeline="image-classification"/>
+- Use [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) to speedup inference.
+  However, it will produce some mismatched elements. The difference between the original and traced model is 1e-4.

- [`Dinov2ForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- See also: [Image classification task guide](../tasks/image_classification)
-
-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+  ```py
+  import torch
+  from transformers import AutoImageProcessor, AutoModel
+  from PIL import Image
+  import requests
+  
+  url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+  image = Image.open(requests.get(url, stream=True).raw)
+  
+  processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
+  model = AutoModel.from_pretrained('facebook/dinov2-base')
+  
+  inputs = processor(images=image, return_tensors="pt")
+  outputs = model(**inputs)
+  last_hidden_states = outputs[0]
+  
+  # We have to force return_dict=False for tracing
+  model.config.return_dict = False
+  
+  with torch.no_grad():
+      traced_model = torch.jit.trace(model, [inputs.pixel_values])
+      traced_outputs = traced_model(inputs.pixel_values)
+  
+  print((last_hidden_states - traced_outputs[0]).abs().max())
+  ```

 ## Dinov2Config

--- a/docs/source/en/model_doc/distilbert.md
+++ b/docs/source/en/model_doc/distilbert.md
@ -83,10 +83,10 @@ print(f"Predicted label: {predicted_label}")

 </hfoption>

-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
-echo -e "I love using Hugging Face Transformers!" | transformers-cli run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
+echo -e "I love using Hugging Face Transformers!" | transformers run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
 ```

 </hfoption>
@ -213,7 +213,3 @@ echo -e "I love using Hugging Face Transformers!" | transformers-cli run --task

 </jax>
 </frameworkcontent>
-
-
-
-
--- a/docs/source/en/model_doc/donut.md
+++ b/docs/source/en/model_doc/donut.md
@ -13,180 +13,191 @@ rendered properly in your Markdown viewer.

 specific language governing permissions and limitations under the License. -->

+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
+
 # Donut

-## Overview
+[Donut (Document Understanding Transformer)](https://huggingface.co/papers2111.15664) is a visual document understanding model that doesn't require an Optical Character Recognition (OCR) engine. Unlike traditional approaches that extract text using OCR before processing, Donut employs an end-to-end Transformer-based architecture to directly analyze document images. This eliminates OCR-related inefficiencies making it more accurate and adaptable to diverse languages and formats. 

-The Donut model was proposed in [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by
-Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
-Donut consists of an image Transformer encoder and an autoregressive text Transformer decoder to perform document understanding
-tasks such as document image classification, form understanding and visual question answering.
+Donut features vision encoder ([Swin](./swin)) and a text decoder ([BART](./bart)). Swin converts document images into embeddings and BART processes them into meaningful text sequences.

-The abstract from the paper is the following:
+You can find all the original Donut checkpoints under the [Naver Clova Information Extraction](https://huggingface.co/naver-clova-ix) organization.

-*Understanding document images (e.g., invoices) is a core but challenging task since it requires complex functions such as reading text and a holistic understanding of the document. Current Visual Document Understanding (VDU) methods outsource the task of reading text to off-the-shelf Optical Character Recognition (OCR) engines and focus on the understanding task with the OCR outputs. Although such OCR-based approaches have shown promising performance, they suffer from 1) high computational costs for using OCR; 2) inflexibility of OCR models on languages or types of document; 3) OCR error propagation to the subsequent process. To address these issues, in this paper, we introduce a novel OCR-free VDU model named Donut, which stands for Document understanding transformer. As the first step in OCR-free VDU research, we propose a simple architecture (i.e., Transformer) with a pre-training objective (i.e., cross-entropy loss). Donut is conceptually simple yet effective. Through extensive experiments and analyses, we show a simple OCR-free VDU model, Donut, achieves state-of-the-art performances on various VDU tasks in terms of both speed and accuracy. In addition, we offer a synthetic data generator that helps the model pre-training to be flexible in various languages and domains.*
+> [!TIP]
+> Click on the Donut models in the right sidebar for more examples of how to apply Donut to different language and vision tasks.

-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/donut_architecture.jpg"
-alt="drawing" width="600"/>
+The examples below demonstrate how to perform document understanding tasks using Donut with [`Pipeline`] and [`AutoModel`]

-<small> Donut high-level overview. Taken from the <a href="https://arxiv.org/abs/2111.15664">original paper</a>. </small>
-
-This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found
-[here](https://github.com/clovaai/donut).
-
-## Usage tips
-
- The quickest way to get started with Donut is by checking the [tutorial
-  notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Donut), which show how to use the model
-  at inference time as well as fine-tuning on custom data.
- Donut is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework.
-
-## Inference examples
-
-Donut's [`VisionEncoderDecoder`] model accepts images as input and makes use of
-[`~generation.GenerationMixin.generate`] to autoregressively generate text given the input image.
-
-The [`DonutImageProcessor`] class is responsible for preprocessing the input image and
-[`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`] decodes the generated target tokens to the target string. The
-[`DonutProcessor`] wraps [`DonutImageProcessor`] and [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]
-into a single instance to both extract the input features and decode the predicted token ids.
-
- Step-by-step Document Image Classification
+<hfoptions id="usage">
+<hfoption id="Pipeline">

 ```py
->>> import re
+# pip install datasets
+import torch
+from transformers import pipeline
+from PIL import Image

->>> from transformers import DonutProcessor, VisionEncoderDecoderModel
->>> from datasets import load_dataset
->>> import torch
+pipeline = pipeline(
+    task="document-question-answering",
+    model="naver-clova-ix/donut-base-finetuned-docvqa",
+    device=0,
+    torch_dtype=torch.float16
+)
+dataset = load_dataset("hf-internal-testing/example-documents", split="test")
+image = dataset[0]["image"]

->>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
->>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
-
->>> device = "cuda" if torch.cuda.is_available() else "cpu"
->>> model.to(device)  # doctest: +IGNORE_RESULT
-
->>> # load document image
->>> dataset = load_dataset("hf-internal-testing/example-documents", split="test")
->>> image = dataset[1]["image"]
-
->>> # prepare decoder inputs
->>> task_prompt = "<s_rvlcdip>"
->>> decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
-
->>> pixel_values = processor(image, return_tensors="pt").pixel_values
-
->>> outputs = model.generate(
-...     pixel_values.to(device),
-...     decoder_input_ids=decoder_input_ids.to(device),
-...     max_length=model.decoder.config.max_position_embeddings,
-...     pad_token_id=processor.tokenizer.pad_token_id,
-...     eos_token_id=processor.tokenizer.eos_token_id,
-...     use_cache=True,
-...     bad_words_ids=[[processor.tokenizer.unk_token_id]],
-...     return_dict_in_generate=True,
-... )
-
->>> sequence = processor.batch_decode(outputs.sequences)[0]
->>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
->>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
->>> print(processor.token2json(sequence))
-{'class': 'advertisement'}
+pipeline(image=image, question="What time is the coffee break?")
 ```

- Step-by-step Document Parsing
+</hfoption>
+<hfoption id="AutoModel">

 ```py
->>> import re
+# pip install datasets
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, AutoModelForVision2Seq

->>> from transformers import DonutProcessor, VisionEncoderDecoderModel
->>> from datasets import load_dataset
->>> import torch
+processor = AutoProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+model = AutoModelForVision2Seq.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

->>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
->>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
+dataset = load_dataset("hf-internal-testing/example-documents", split="test")
+image = dataset[0]["image"]
+question = "What time is the coffee break?"
+task_prompt = f"<s_docvqa><s_question>{question}</s_question><s_answer>"
+inputs = processor(image, task_prompt, return_tensors="pt")

->>> device = "cuda" if torch.cuda.is_available() else "cpu"
->>> model.to(device)  # doctest: +IGNORE_RESULT
-
->>> # load document image
->>> dataset = load_dataset("hf-internal-testing/example-documents", split="test")
->>> image = dataset[2]["image"]
-
->>> # prepare decoder inputs
->>> task_prompt = "<s_cord-v2>"
->>> decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
-
->>> pixel_values = processor(image, return_tensors="pt").pixel_values
-
->>> outputs = model.generate(
-...     pixel_values.to(device),
-...     decoder_input_ids=decoder_input_ids.to(device),
-...     max_length=model.decoder.config.max_position_embeddings,
-...     pad_token_id=processor.tokenizer.pad_token_id,
-...     eos_token_id=processor.tokenizer.eos_token_id,
-...     use_cache=True,
-...     bad_words_ids=[[processor.tokenizer.unk_token_id]],
-...     return_dict_in_generate=True,
-... )
-
->>> sequence = processor.batch_decode(outputs.sequences)[0]
->>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
->>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
->>> print(processor.token2json(sequence))
-{'menu': {'nm': 'CINNAMON SUGAR', 'unitprice': '17,000', 'cnt': '1 x', 'price': '17,000'}, 'sub_total': {'subtotal_price': '17,000'}, 'total': {'total_price': '17,000', 'cashprice': '20,000', 'changeprice': '3,000'}}
+outputs = model.generate(
+    input_ids=inputs.input_ids,
+    pixel_values=inputs.pixel_values,
+    max_length=512
+)
+answer = processor.decode(outputs[0], skip_special_tokens=True)
+print(answer)
 ```

- Step-by-step Document Visual Question Answering (DocVQA)
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.

 ```py
->>> import re
+# pip install datasets torchao
+import torch
+from datasets import load_dataset
+from transformers import TorchAoConfig, AutoProcessor, AutoModelForVision2Seq

->>> from transformers import DonutProcessor, VisionEncoderDecoderModel
->>> from datasets import load_dataset
->>> import torch
+quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
+processor = AutoProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+model = AutoModelForVision2Seq.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa", quantization_config=quantization_config)

->>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
->>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+dataset = load_dataset("hf-internal-testing/example-documents", split="test")
+image = dataset[0]["image"]
+question = "What time is the coffee break?"
+task_prompt = f"<s_docvqa><s_question>{question}</s_question><s_answer>"
+inputs = processor(image, task_prompt, return_tensors="pt")

->>> device = "cuda" if torch.cuda.is_available() else "cpu"
->>> model.to(device)  # doctest: +IGNORE_RESULT
-
->>> # load document image from the DocVQA dataset
->>> dataset = load_dataset("hf-internal-testing/example-documents", split="test")
->>> image = dataset[0]["image"]
-
->>> # prepare decoder inputs
->>> task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
->>> question = "When is the coffee break?"
->>> prompt = task_prompt.replace("{user_input}", question)
->>> decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
-
->>> pixel_values = processor(image, return_tensors="pt").pixel_values
-
->>> outputs = model.generate(
-...     pixel_values.to(device),
-...     decoder_input_ids=decoder_input_ids.to(device),
-...     max_length=model.decoder.config.max_position_embeddings,
-...     pad_token_id=processor.tokenizer.pad_token_id,
-...     eos_token_id=processor.tokenizer.eos_token_id,
-...     use_cache=True,
-...     bad_words_ids=[[processor.tokenizer.unk_token_id]],
-...     return_dict_in_generate=True,
-... )
-
->>> sequence = processor.batch_decode(outputs.sequences)[0]
->>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
->>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
->>> print(processor.token2json(sequence))
-{'question': 'When is the coffee break?', 'answer': '11-14 to 11:39 a.m.'}
+outputs = model.generate(
+    input_ids=inputs.input_ids,
+    pixel_values=inputs.pixel_values,
+    max_length=512
+)
+answer = processor.decode(outputs[0], skip_special_tokens=True)
+print(answer)
 ```

-See the [model hub](https://huggingface.co/models?filter=donut) to look for Donut checkpoints.
+## Notes

-## Training
+- Use Donut for document image classification as shown below.

-We refer to the [tutorial notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Donut).
+    ```py
+    >>> import re
+    >>> from transformers import DonutProcessor, VisionEncoderDecoderModel
+    >>> from datasets import load_dataset
+    >>> import torch
+
+    >>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
+    >>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
+
+    >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+    >>> model.to(device)  # doctest: +IGNORE_RESULT
+
+    >>> # load document image
+    >>> dataset = load_dataset("hf-internal-testing/example-documents", split="test")
+    >>> image = dataset[1]["image"]
+
+    >>> # prepare decoder inputs
+    >>> task_prompt = "<s_rvlcdip>"
+    >>> decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
+
+    >>> pixel_values = processor(image, return_tensors="pt").pixel_values
+
+    >>> outputs = model.generate(
+    ...     pixel_values.to(device),
+    ...     decoder_input_ids=decoder_input_ids.to(device),
+    ...     max_length=model.decoder.config.max_position_embeddings,
+    ...     pad_token_id=processor.tokenizer.pad_token_id,
+    ...     eos_token_id=processor.tokenizer.eos_token_id,
+    ...     use_cache=True,
+    ...     bad_words_ids=[[processor.tokenizer.unk_token_id]],
+    ...     return_dict_in_generate=True,
+    ... )
+
+    >>> sequence = processor.batch_decode(outputs.sequences)[0]
+    >>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+    >>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+    >>> print(processor.token2json(sequence))
+    {'class': 'advertisement'}
+    ```
+
+- Use Donut for document parsing as shown below.
+
+    ```py
+    >>> import re
+    >>> from transformers import DonutProcessor, VisionEncoderDecoderModel
+    >>> from datasets import load_dataset
+    >>> import torch
+
+    >>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
+    >>> model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
+
+    >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+    >>> model.to(device)  # doctest: +IGNORE_RESULT
+
+    >>> # load document image
+    >>> dataset = load_dataset("hf-internal-testing/example-documents", split="test")
+    >>> image = dataset[2]["image"]
+
+    >>> # prepare decoder inputs
+    >>> task_prompt = "<s_cord-v2>"
+    >>> decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
+
+    >>> pixel_values = processor(image, return_tensors="pt").pixel_values
+
+    >>> outputs = model.generate(
+    ...     pixel_values.to(device),
+    ...     decoder_input_ids=decoder_input_ids.to(device),
+    ...     max_length=model.decoder.config.max_position_embeddings,
+    ...     pad_token_id=processor.tokenizer.pad_token_id,
+    ...     eos_token_id=processor.tokenizer.eos_token_id,
+    ...     use_cache=True,
+    ...     bad_words_ids=[[processor.tokenizer.unk_token_id]],
+    ...     return_dict_in_generate=True,
+    ... )
+
+    >>> sequence = processor.batch_decode(outputs.sequences)[0]
+    >>> sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
+    >>> sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
+    >>> print(processor.token2json(sequence))
+    {'menu': {'nm': 'CINNAMON SUGAR', 'unitprice': '17,000', 'cnt': '1 x', 'price': '17,000'}, 'sub_total': {'subtotal_price': '17,000'}, 'total': 
+    {'total_price': '17,000', 'cashprice': '20,000', 'changeprice': '3,000'}}
+    ```

 ## DonutSwinConfig

@ -197,6 +208,11 @@ We refer to the [tutorial notebooks](https://github.com/NielsRogge/Transformers-
 [[autodoc]] DonutImageProcessor
    - preprocess

+## DonutImageProcessorFast
+
+[[autodoc]] DonutImageProcessorFast
+    - preprocess
+
 ## DonutFeatureExtractor

 [[autodoc]] DonutFeatureExtractor
@ -215,3 +231,8 @@ We refer to the [tutorial notebooks](https://github.com/NielsRogge/Transformers-

 [[autodoc]] DonutSwinModel
    - forward
+
+## DonutSwinForImageClassification
+
+[[autodoc]] transformers.DonutSwinForImageClassification
+    - forward
--- a/docs/source/en/model_doc/efficientnet.md
+++ b/docs/source/en/model_doc/efficientnet.md
@ -43,6 +43,11 @@ The original code can be found [here](https://github.com/tensorflow/tpu/tree/mas
 [[autodoc]] EfficientNetImageProcessor
    - preprocess

+## EfficientNetImageProcessorFast
+
+[[autodoc]] EfficientNetImageProcessorFast
+    - preprocess
+
 ## EfficientNetModel

 [[autodoc]] EfficientNetModel
--- a/docs/source/en/model_doc/electra.md
+++ b/docs/source/en/model_doc/electra.md
@ -45,9 +45,9 @@ import torch
 from transformers import pipeline

 classifier = pipeline(
-    task="text-classification", 
-    model="bhadresh-savani/electra-base-emotion", 
-    torch_dtype=torch.float16, 
+    task="text-classification",
+    model="bhadresh-savani/electra-base-emotion",
+    torch_dtype=torch.float16,
    device=0
 )
 classifier("This restaurant has amazing food!")
@ -64,7 +64,7 @@ tokenizer = AutoTokenizer.from_pretrained(
    "bhadresh-savani/electra-base-emotion",
 )
 model = AutoModelForSequenceClassification.from_pretrained(
-    "bhadresh-savani/electra-base-emotion", 
+    "bhadresh-savani/electra-base-emotion",
    torch_dtype=torch.float16
 )
 inputs = tokenizer("ELECTRA is more efficient than BERT", return_tensors="pt")
@ -78,10 +78,10 @@ print(f"Predicted label: {predicted_label}")
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
-echo -e "This restaurant has amazing food." | transformers-cli run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
+echo -e "This restaurant has amazing food." | transformers run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
 ```

 </hfoption>
@ -96,12 +96,12 @@ echo -e "This restaurant has amazing food." | transformers-cli run --task text-c

    ```py
    # Example of properly handling padding with attention masks
-    inputs = tokenizer(["Short text", "This is a much longer text that needs padding"], 
-                    padding=True, 
+    inputs = tokenizer(["Short text", "This is a much longer text that needs padding"],
+                    padding=True,
                    return_tensors="pt")
    outputs = model(**inputs)  # automatically uses the attention_mask
    ```
-    
+
 - When using the discriminator for a downstream task, you can load it into any of the ELECTRA model classes ([`ElectraForSequenceClassification`], [`ElectraForTokenClassification`], etc.).

 ## ElectraConfig
--- a/docs/source/en/model_doc/emu3.md
+++ b/docs/source/en/model_doc/emu3.md
@ -174,6 +174,10 @@ for i, image in enumerate(images['pixel_values']):
 [[autodoc]] Emu3TextModel
    - forward

+## Emu3Model
+
+[[autodoc]] Emu3Model
+
 ## Emu3ForCausalLM

 [[autodoc]] Emu3ForCausalLM
--- a/docs/source/en/model_doc/falcon.md
+++ b/docs/source/en/model_doc/falcon.md
@ -41,7 +41,7 @@ import torch
 from transformers import pipeline

 pipeline = pipeline(
-    task="text-generation", 
+    task="text-generation",
    model="tiiuae/falcon-7b-instruct",
    torch_dtype=torch.bfloat16,
    device=0
@ -76,11 +76,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers-cli">
+<hfoption id="transformers CLI">

 ```bash
 # pip install -U flash-attn --no-build-isolation
-transformers-cli chat --model_name_or_path tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
+transformers chat tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
 ```

 </hfoption>
@ -150,4 +150,4 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ## FalconForQuestionAnswering

 [[autodoc]] FalconForQuestionAnswering
-    - forward
+    - forward
--- a/docs/source/en/model_doc/falcon_mamba.md
+++ b/docs/source/en/model_doc/falcon_mamba.md
@ -14,95 +14,100 @@ rendered properly in your Markdown viewer.

 -->

-# FalconMamba
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# FalconMamba

-The FalconMamba model was proposed by TII UAE (Technology Innovation Institute) in their release.
+[FalconMamba](https://huggingface.co/papers/2410.05355) is a 7B large language model, available as pretrained and instruction-tuned variants, based on the [Mamba](./mamba). This model implements a pure Mamba design that focuses on computational efficiency while maintaining strong performance. FalconMamba is significantly faster at inference and requires substantially less memory for long sequence generation. The models are pretrained on a diverse 5.8T token dataset including [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb), technical content, code, and mathematical data.

-The abstract from the paper is the following:
+You can find the official FalconMamba checkpoints in the [FalconMamba 7B](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) collection.

-*We present FalconMamba, a new base large language model based on the novel Mamba architecture. FalconMamba is trained on 5.8 trillion tokens with carefully selected data mixtures. As a pure Mamba-based model, FalconMamba surpasses leading open-weight models based on Transformers, such as Mistral 7B, Llama3 8B, and Falcon2 11B. It is on par with Gemma 7B and outperforms models with different architecture designs, such as RecurrentGemma 9B. Currently, FalconMamba is the best-performing Mamba model in the literature at this scale, surpassing both existing Mamba and hybrid Mamba-Transformer models.
-Due to its architecture, FalconMamba is significantly faster at inference and requires substantially less memory for long sequence generation. Despite recent studies suggesting that hybrid Mamba-Transformer models outperform pure architecture designs, we argue and demonstrate that the pure Mamba design can achieve similar, even superior results compared to the hybrid design. We make the weights of our implementation of FalconMamba publicly available under a permissive license.*
+> [!TIP]
+> Click on the FalconMamba models in the right sidebar for more examples of how to apply FalconMamba to different language tasks.

-Tips:
+The examples below demonstrate how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.

- FalconMamba is mostly based on Mamba architecture, the same [tips and best practices](./mamba) would be relevant here.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-The model has been trained on approximtely 6T tokens consisting a mixture of many data sources such as RefineWeb, Cosmopedia and Math data.
-
-For more details about the training procedure and the architecture, have a look at [the technical paper of FalconMamba]() (coming soon).
-
-# Usage
-
-Below we demonstrate how to use the model:
-
-```python 
-from transformers import FalconMambaForCausalLM, AutoTokenizer
+```py
 import torch
+from transformers import pipeline

-tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
-model = FalconMambaForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b")
-
-input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"]
-
-out = model.generate(input_ids, max_new_tokens=10)
-print(tokenizer.batch_decode(out))
+pipeline = pipeline(
+    "text-generation",
+    model="tiiuae/falcon-mamba-7b-instruct",
+    torch_dtype=torch.bfloat16,
+    device=0
+)
+pipeline(
+    "Explain the difference between transformers and SSMs",
+    max_length=100,
+    do_sample=True,
+    temperature=0.7
+)
 ```

-The architecture is also compatible with `torch.compile` for faster generation:
+</hfoption>
+<hfoption id="AutoModel">

-```python 
-from transformers import FalconMambaForCausalLM, AutoTokenizer
-import torch
-
-tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
-model = FalconMambaForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b", torch_dtype=torch.bfloat16).to(0)
-model = torch.compile(model)
-
-input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"]
-
-out = model.generate(input_ids, max_new_tokens=10)
-print(tokenizer.batch_decode(out))
-```
-
-If you have access to a GPU that is compatible with `bitsandbytes`, you can also quantize the model in 4-bit precision:
-
-```python 
-from transformers import FalconMambaForCausalLM, AutoTokenizer, BitsAndBytesConfig
-import torch
-
-tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-model = FalconMambaForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b", quantization_config=quantization_config)
-
-input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"]
-
-out = model.generate(input_ids, max_new_tokens=10)
-print(tokenizer.batch_decode(out))
-```
-
-You can also play with the instruction fine-tuned model:
-
-```python 
-from transformers import FalconMambaForCausalLM, AutoTokenizer
+```py
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM

 tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b-instruct")
-model = FalconMambaForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b-instruct")
+model = AutoModelForCausalLM.from_pretrained(
+    "tiiuae/falcon-mamba-7b-instruct",
+    torch_dtype=torch.bfloat16,
+    device_map="auto"
+)

-# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
-messages = [
-    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
-]
-input_ids = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True).input_ids
+input_ids = tokenizer("Explain the difference between transformers and SSMs", return_tensors="pt").to("cuda")

-outputs = model.generate(input_ids)
-print(tokenizer.decode(outputs[0]))
+output = model.generate(**input_ids, max_new_tokens=100, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+transformers chat tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4-bits.
+
+```python
+import torch
+from transformers import AutoTokenizer, FalconMambaForCausalLM, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+)
+
+tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
+model = FalconMambaForCausalLM.from_pretrained(
+    "tiiuae/falcon-mamba-7b",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config,
+)
+
+inputs = tokenizer("Explain the concept of state space models in simple terms", return_tensors="pt").to("cuda")
+outputs = model.generate(**inputs, max_new_tokens=100)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

 ## FalconMambaConfig
--- a/docs/source/en/model_doc/flava.md
+++ b/docs/source/en/model_doc/flava.md
@ -72,6 +72,11 @@ This model was contributed by [aps](https://huggingface.co/aps). The original co
 [[autodoc]] FlavaImageProcessor
    - preprocess

+## FlavaImageProcessorFast
+
+[[autodoc]] FlavaImageProcessorFast
+    - preprocess
+
 ## FlavaForPreTraining

 [[autodoc]] FlavaForPreTraining
--- a/docs/source/en/model_doc/fuyu.md
+++ b/docs/source/en/model_doc/fuyu.md
@ -103,6 +103,10 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.

 [[autodoc]] FuyuConfig

+## FuyuModel
+
+[[autodoc]] FuyuModel
+
 ## FuyuForCausalLM

 [[autodoc]] FuyuForCausalLM
--- a/docs/source/en/model_doc/gemma.md
+++ b/docs/source/en/model_doc/gemma.md
@ -1,4 +1,5 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -14,31 +15,146 @@ rendered properly in your Markdown viewer.

 -->

-# Gemma
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+        ">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# Gemma

-The Gemma model was proposed in [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by Gemma Team, Google.
-Gemma models are trained on 6T tokens, and released with 2 versions, 2b and 7b.
+[Gemma](https://huggingface.co/papers/2403.08295) is a family of lightweight language models with pretrained and instruction-tuned variants, available in 2B and 7B parameters. The architecture is based on a transformer decoder-only design. It features Multi-Query Attention, rotary positional embeddings (RoPE), GeGLU activation functions, and RMSNorm layer normalization.

-The abstract from the paper is the following:
+The instruction-tuned variant was fine-tuned with supervised learning on instruction-following data, followed by reinforcement learning from human feedback (RLHF) to align the model outputs with human preferences.

-*This work introduces Gemma, a new family of open language models demonstrating strong performance across academic benchmarks for language understanding, reasoning, and safety. We release two sizes of models (2 billion and 7 billion parameters), and provide both pretrained and fine-tuned checkpoints. Gemma outperforms similarly sized open models on 11 out of 18 text-based tasks, and we present comprehensive evaluations of safety and responsibility aspects of the models, alongside a detailed description of our model development. We believe the responsible release of LLMs is critical for improving the safety of frontier models, and for enabling the next wave of LLM innovations*
+You can find all the original Gemma checkpoints under the [Gemma](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) release.

-Tips:

- The original checkpoints can be converted using the conversion script `src/transformers/models/gemma/convert_gemma_weights_to_hf.py` 
+> [!TIP]
+> Click on the Gemma models in the right sidebar for more examples of how to apply Gemma to different language tasks.

-This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), [Sanchit Gandhi](https://huggingface.co/sanchit-gandhi), [Pedro Cuenca](https://huggingface.co/pcuenq).
+The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`] class, and from the command line.

+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="text-generation",
+    model="google/gemma-2b",
+    torch_dtype=torch.bfloat16,
+    device="cuda",
+)
+
+pipeline("LLMs generate text through a process known as", max_new_tokens=50)
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
+model = AutoModelForCausalLM.from_pretrained(
+    "google/gemma-2b",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+
+input_text = "LLMs generate text through a process known as"
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+outputs = model.generate(**input_ids, max_new_tokens=50, cache_implementation="static")
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "LLMs generate text through a process known as" | transformers run --task text-generation --model google/gemma-2b --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
+
+```py
+#!pip install bitsandbytes
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4"
+)
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
+model = AutoModelForCausalLM.from_pretrained(
+    "google/gemma-7b",
+    quantization_config=quantization_config,
+    device_map="auto",
+    attn_implementation="sdpa"
+)
+
+input_text = "LLMs generate text through a process known as."
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+outputs = model.generate(
+    **input_ids,
+    max_new_tokens=50,
+    cache_implementation="static"
+)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139) to better understand what tokens the model can and cannot attend to.
+
+```py
+from transformers.utils.attention_visualizer import AttentionMaskVisualizer
+
+visualizer = AttentionMaskVisualizer("google/gemma-2b")
+visualizer("LLMs generate text through a process known as")
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/gemma-attn-mask.png"/>
+</div>
+
+## Notes
+
+- The original Gemma models support standard kv-caching used in many transformer-based language models. You can use use the default [`DynamicCache`] instance or a tuple of tensors for past key values during generation. This makes it compatible with typical autoregressive generation workflows.
+
+   ```py
+   import torch
+   from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
+
+   tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
+   model = AutoModelForCausalLM.from_pretrained(
+       "google/gemma-2b",
+       torch_dtype=torch.bfloat16,
+       device_map="auto",
+       attn_implementation="sdpa"
+   )
+   input_text = "LLMs generate text through a process known as"
+   input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+   past_key_values = DynamicCache()
+   outputs = model.generate(**input_ids, max_new_tokens=50, past_key_values=past_key_values)
+   print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+   ```

 ## GemmaConfig

--- a/Show More
+++ b/Show More