Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-22 02:08:58 +08:00)

Compare commits: fix_sam_sa...fix-compre (7 commits)

0fca0f2601
faad39c684
f92a3c5836
d4e6a763b9
ed8e673ee9
eab4db5e1e
9dff86d7a3
@ -7,18 +7,6 @@ parameters:
nightly:
type: boolean
default: false
GHA_Actor:
type: string
default: ""
GHA_Action:
type: string
default: ""
GHA_Event:
type: string
default: ""
GHA_Meta:
type: string
default: ""

jobs:
# Ensure running with CircleCI/huggingface
@ -43,10 +31,8 @@ jobs:
parallelism: 1
steps:
- checkout
- run: python3 utils/extract_pr_number_from_circleci.py > pr_number.txt
- run: echo $(cat pr_number.txt)
- run: if [[ "$(cat pr_number.txt)" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
- run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$(cat pr_number.txt) >> github.txt'
- run: if [[ "$CIRCLE_PULL_REQUEST" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
- run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/${CIRCLE_PULL_REQUEST##*/} >> github.txt'
- run: cat github.txt
- run: (python3 -c 'import json; from datetime import datetime; fp = open("github.txt"); data = json.load(fp); fp.close(); f = "%Y-%m-%dT%H:%M:%SZ"; created = datetime.strptime(data["created_at"], f); updated = datetime.strptime(data["updated_at"], f); s = (updated - created).total_seconds(); print(int(s))' || true) > elapsed.txt
- run: if [ "$(cat elapsed.txt)" == "" ]; then echo 60 > elapsed.txt; fi
@ -168,7 +154,7 @@ jobs:
path: ~/transformers/installed.txt
- run: python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
- run: ruff check examples tests src utils
- run: ruff format examples tests src utils --check
- run: ruff format tests src utils --check
- run: python utils/custom_init_isort.py --check_only
- run: python utils/sort_auto_mappings.py --check_only
- run: python utils/check_doc_toc.py
@ -28,8 +28,6 @@ COMMON_ENV_VARIABLES = {
"TRANSFORMERS_IS_CI": True,
"PYTEST_TIMEOUT": 120,
"RUN_PIPELINE_TESTS": False,
# will be adjust in `CircleCIJob.to_dict`.
"RUN_FLAKY": True,
}
# Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}
@ -110,7 +108,6 @@ class CircleCIJob:
print(f"Using {self.docker_image} docker image")
if self.install_steps is None:
self.install_steps = ["uv venv && uv pip install ."]
self.install_steps.append("uv venv && uv pip install git+https://github.com/ydshieh/pytest.git@8.3.5-ydshieh git+https://github.com/ydshieh/pluggy.git@1.5.0-ydshieh")
if self.pytest_options is None:
self.pytest_options = {}
if isinstance(self.tests_to_run, str):
@ -129,8 +126,6 @@ class CircleCIJob:

def to_dict(self):
env = COMMON_ENV_VARIABLES.copy()
# Do not run tests decorated by @is_flaky on pull requests
env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
env.update(self.additional_env)

job = {
@ -176,7 +171,6 @@ class CircleCIJob:
"command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt"
}
},
{"run": {"name": "fetch hub objects before pytest", "command": "python3 utils/fetch_hub_objects_for_ci.py"}},
{"run": {
"name": "Run tests",
"command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
@ -212,9 +206,6 @@ torch_job = CircleCIJob(
generate_job = CircleCIJob(
"generate",
docker_image=[{"image": "huggingface/transformers-torch-light"}],
# networkx==3.3 (after #36957) cause some issues
# TODO: remove this once it works directly
install_steps=["uv venv && uv pip install . && uv pip install networkx==3.2.1"],
marker="generate",
parallelism=6,
)
@ -337,9 +328,6 @@ repo_utils_job = CircleCIJob(
non_model_job = CircleCIJob(
"non_model",
docker_image=[{"image": "huggingface/transformers-torch-light"}],
# networkx==3.3 (after #36957) cause some issues
# TODO: remove this once it works directly
install_steps=["uv venv && uv pip install . && uv pip install networkx==3.2.1"],
marker="not generate",
parallelism=6,
)
@ -369,9 +357,9 @@ doc_test_job = CircleCIJob(
pytest_num_workers=1,
)

REGULAR_TESTS = [torch_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
EXAMPLES_TESTS = [examples_torch_job]
PIPELINE_TESTS = [pipelines_torch_job]
REGULAR_TESTS = [torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
EXAMPLES_TESTS = [examples_torch_job, examples_tensorflow_job]
PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job]
REPO_UTIL_TESTS = [repo_utils_job]
DOC_TESTS = [doc_test_job]
ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip
@ -398,12 +386,7 @@ def create_circleci_config(folder=None):
"parameters": {
# Only used to accept the parameters from the trigger
"nightly": {"type": "boolean", "default": False},
# Only used to accept the parameters from GitHub Actions trigger
"GHA_Actor": {"type": "string", "default": ""},
"GHA_Action": {"type": "string", "default": ""},
"GHA_Event": {"type": "string", "default": ""},
"GHA_Meta": {"type": "string", "default": ""},
"tests_to_run": {"type": "string", "default": ""},
"tests_to_run": {"type": "string", "default": ''},
**{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs},
**{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs},
},
12  .github/ISSUE_TEMPLATE/bug-report.yml

@ -16,7 +16,7 @@ body:
id: system-info
attributes:
label: System Info
description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
placeholder: transformers version, platform, python version, ...
validations:
required: true
@ -48,20 +48,14 @@ body:
- pipelines: @Rocketknight1
- tensorflow: @gante and @Rocketknight1
- tokenizers: @ArthurZucker and @itazap
- trainer: @zach-huggingface @SunMarc
- trainer: @muellerzr @SunMarc

Integrations:

- deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface
- deepspeed: HF Trainer/Accelerate: @muellerzr
- ray/raytune: @richardliaw, @amogkam
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber

Devices/Backends:

- AMD ROCm: @ivarflakstad
- Intel XPU: @IlyasMoutawwakil
- Ascend NPU: @ivarflakstad

Documentation: @stevhliu
2  .github/ISSUE_TEMPLATE/i18n.md

@ -23,7 +23,7 @@ Some notes:
* Please translate in a gender-neutral way.
* Add your translations to the folder called `<languageCode>` inside the [source folder](https://github.com/huggingface/transformers/tree/main/docs/source).
* Register your translation in `<languageCode>/_toctree.yml`; please follow the order of the [English version](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml).
* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu for review.
* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu and @MKhalusova for review.
* 🙋 If you'd like others to help you with the translation, you can also post in the 🤗 [forums](https://discuss.huggingface.co/).

## Get Started section
2  .github/ISSUE_TEMPLATE/migration.yml

@ -6,7 +6,7 @@ body:
id: system-info
attributes:
label: System Info
description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
render: shell
placeholder: transformers version, platform, python version, ...
validations:
4  .github/PULL_REQUEST_TEMPLATE.md

@ -51,12 +51,12 @@ Library:
- pipelines: @Rocketknight1
- tensorflow: @gante and @Rocketknight1
- tokenizers: @ArthurZucker
- trainer: @zach-huggingface and @SunMarc
- trainer: @muellerzr and @SunMarc
- chat templates: @Rocketknight1

Integrations:

- deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface
- deepspeed: HF Trainer/Accelerate: @muellerzr
- ray/raytune: @richardliaw, @amogkam
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
18  .github/scripts/assign_reviewers.py

@ -54,21 +54,6 @@ def get_file_owners(file_path, codeowners_lines):
return owners # Remember, can still be empty!
return [] # Should never happen, but just in case

def pr_author_is_in_hf(pr_author, codeowners_lines):
# Check if the PR author is in the codeowners file
for line in codeowners_lines:
line = line.split('#')[0].strip()
if not line:
continue

# Split into pattern and owners
parts = line.split()
owners = [owner.removeprefix("@") for owner in parts[1:]]

if pr_author in owners:
return True
return False

def main():
script_dir = Path(__file__).parent.absolute()
with open(script_dir / "codeowners_for_review_action") as f:
@ -83,9 +68,6 @@ def main():
pr_number = event['pull_request']['number']
pr = repo.get_pull(pr_number)
pr_author = pr.user.login
if pr_author_is_in_hf(pr_author, codeowners_lines):
print(f"PR author {pr_author} is in codeowners, skipping review request.")
return

existing_reviews = list(pr.get_reviews())
if existing_reviews:
6  .github/scripts/codeowners_for_review_action

@ -14,7 +14,7 @@ docs/ @stevhliu
# Owners of subsections of the library
/src/transformers/generation/ @gante
/src/transformers/pipeline/ @Rocketknight1 @yonigozlan
/src/transformers/integrations/ @SunMarc @MekkCyber @zach-huggingface
/src/transformers/integrations/ @SunMarc @MekkCyber @muellerzr
/src/transformers/quantizers/ @SunMarc @MekkCyber
tests/ @ydshieh
tests/generation/ @gante
@ -27,8 +27,8 @@ tests/generation/ @gante
# Specific files come after the sections/globs, so they take priority
/.circleci/config.yml @ArthurZucker @ydshieh
/utils/tests_fetcher.py @ydshieh
trainer.py @zach-huggingface @SunMarc
trainer_utils.py @zach-huggingface @SunMarc
trainer.py @muellerzr @SunMarc
trainer_utils.py @muellerzr @SunMarc
/utils/modular_model_converter.py @Cyrilvallez @ArthurZucker

# Owners of individual models are specific / high priority, and so they come last
2  .github/workflows/add-model-like.yml

@ -54,7 +54,7 @@ jobs:
- name: Create model files
run: |
. ~/venv/bin/activate
transformers add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
make style
make fix-copies
36  .github/workflows/build-docker-images.yml

@ -63,14 +63,14 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

latest-torch-deepspeed-docker:
name: "Latest PyTorch + DeepSpeed"
runs-on:
group: aws-g4dn-2xlarge-cache
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
@ -99,7 +99,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -140,7 +140,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -176,7 +176,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-doc-builder docker build
title: 🤗 Results of the huggingface/transformers-doc-builder docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -214,7 +214,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build
title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -223,19 +223,19 @@ jobs:
runs-on:
group: aws-general-8-plus
steps:
-
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
-
name: Check out code
uses: actions/checkout@v4
-
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
-
name: Build and push
uses: docker/build-push-action@v5
with:
@ -263,7 +263,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -301,7 +301,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -310,19 +310,19 @@ jobs:
runs-on:
group: aws-general-8-plus
steps:
-
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
-
name: Check out code
uses: actions/checkout@v4
-
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
-
name: Build and push
uses: docker/build-push-action@v5
with:
@ -350,7 +350,7 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -388,6 +388,6 @@ jobs:
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-quantization-latest-gpu build
title: 🤗 Results of the transformers-quantization-latest-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@ -42,7 +42,7 @@ jobs:
nightly-torch-deepspeed-docker:
name: "Nightly PyTorch + DeepSpeed"
runs-on:
group: aws-g4dn-2xlarge-cache
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
2  .github/workflows/build_pr_documentation.yml

@ -14,4 +14,4 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: transformers
languages: en
languages: ar de en es fr hi it ko pt tr zh ja te
25  .github/workflows/change_pr_to_draft.yml (new file)

@ -0,0 +1,25 @@
name: Change PR to draft

on:
pull_request_target:
types: [opened, reopened]

jobs:
convert_pr_to_draft:
runs-on: ubuntu-22.04
name: Convert PR to draft
permissions:
pull-requests: write
contents: write
if: github.event.pull_request.draft == false
steps:
- name: Convert PR to draft
shell: bash
env:
PR_NUMBER: ${{ github.event.number }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
run: |
echo $PR_NUMBER
gh pr ready $PR_NUMBER --repo $REPO --undo
gh pr comment $PR_NUMBER --repo $REPO --body "Hi 👋, thank you for opening this pull request! The pull request is converted to draft by default. The CI will be paused while the PR is in draft mode. When it is ready for review, please click the \`Ready for review\` button (at the bottom of the PR page). This will assign reviewers."
@ -29,7 +29,7 @@ jobs:
run_models_gpu:
name: " "
runs-on:
group: aws-g4dn-4xlarge-cache
group: aws-g4dn-2xlarge-cache
container:
image: ${{ inputs.docker }}
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
2  .github/workflows/doctest_job.yml

@ -28,7 +28,7 @@ jobs:
matrix:
split_keys: ${{ fromJson(inputs.split_keys) }}
runs-on:
group: aws-g4dn-4xlarge-cache
group: aws-g4dn-2xlarge-cache
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
2  .github/workflows/doctests.yml

@ -15,7 +15,7 @@ jobs:
setup:
name: Setup
runs-on:
group: aws-g4dn-4xlarge-cache
group: aws-g4dn-2xlarge-cache
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
22  .github/workflows/model_jobs.yml

@ -18,10 +18,6 @@ on:
docker:
required: true
type: string
report_name_prefix:
required: false
default: run_models_gpu
type: string

env:
HF_HOME: /mnt/cache
@ -107,7 +103,7 @@ jobs:
run: |
echo "${{ inputs.machine_type }}"

if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ inputs.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
@ -120,23 +116,23 @@ jobs:

- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}

- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt

- name: Run test
shell: bash
run: |
mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"

- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
@ -59,7 +59,7 @@ jobs:
"type": "section",
"text": {
"type": "mrkdwn",
"text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh\ncommit SHA: ${{ env.COMMIT_SHA }}"
"text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh"
}
}
]
19  .github/workflows/pr-style-bot.yml

@ -1,19 +0,0 @@
# To run this bot, comment "@bot /style" on a PR
name: Style Bot

on:
issue_comment:
types: [created]

permissions:
contents: write
pull-requests: write

jobs:
style:
uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
with:
python_quality_dependencies: "[quality]"
style_command_type: "default"
secrets:
bot_token: ${{ secrets.GITHUB_TOKEN }}
12  .github/workflows/self-comment-ci.yml

@ -29,7 +29,7 @@ jobs:
runs-on: ubuntu-22.04
name: Get PR number
# For security: only allow team members to run
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
outputs:
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
steps:
@ -145,7 +145,7 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
MODELS: ${{ needs.get-tests.outputs.models }}
BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
run: |
gh api \
--method POST \
@ -185,7 +185,7 @@ jobs:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.models) }}
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
@ -239,7 +239,7 @@ jobs:
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
@ -292,7 +292,7 @@ jobs:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
@ -338,7 +338,7 @@ jobs:
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
13  .github/workflows/self-scheduled-caller.yml

@ -54,23 +54,12 @@ jobs:
ci_event: Daily CI
secrets: inherit

trainer-fsdp-ci:
name: Trainer/FSDP CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_trainer_and_fsdp_gpu
slack_report_channel: "#transformers-ci-daily-training"
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
secrets: inherit

deepspeed-ci:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-daily-training"
slack_report_channel: "#transformers-ci-daily-deepspeed"
runner: daily-ci
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Daily CI
59  .github/workflows/self-scheduled.yml

@ -45,11 +45,11 @@ env:

jobs:
setup:
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
if: contains(fromJSON('["run_models_gpu", "run_quantization_torch_gpu"]'), inputs.job)
name: Setup
strategy:
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
@ -77,17 +77,12 @@ jobs:
run: pip freeze

- id: set-matrix
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
if: ${{ inputs.job == 'run_models_gpu' }}
name: Identify models to test
working-directory: /transformers/tests
run: |
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
fi
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT

- id: set-matrix-quantization
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
@ -107,7 +102,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs.yml
with:
@ -118,32 +113,13 @@ jobs:
docker: ${{ inputs.docker }}
secrets: inherit

run_trainer_and_fsdp_gpu:
if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
name: " "
needs: setup
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
slice_id: [0, 1]
uses: ./.github/workflows/model_jobs.yml
with:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
machine_type: ${{ matrix.machine_type }}
slice_id: ${{ matrix.slice_id }}
runner: ${{ inputs.runner }}
docker: ${{ inputs.docker }}
report_name_prefix: run_trainer_and_fsdp_gpu
secrets: inherit

run_pipelines_torch_gpu:
if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
name: PyTorch pipelines
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
@ -177,7 +153,7 @@ jobs:
run: |
echo "${{ matrix.machine_type }}"

if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
@ -211,7 +187,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
@ -246,7 +222,7 @@ jobs:
run: |
echo "${{ matrix.machine_type }}"

if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
@ -280,7 +256,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache]
machine_type: [aws-g4dn-2xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
@ -314,7 +290,7 @@ jobs:
run: |
echo "${{ matrix.machine_type }}"

if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
@ -349,7 +325,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
@ -406,12 +382,12 @@ jobs:
run: pip freeze

- name: Set `machine_type` for report and artifact names
working-directory: ${{ inputs.working-directory-prefix }}/transformers
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"

if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
@ -448,7 +424,7 @@ jobs:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
@ -491,7 +467,7 @@ jobs:
run: |
echo "${{ matrix.machine_type }}"

if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
@ -565,7 +541,6 @@ jobs:
needs: [
setup,
run_models_gpu,
run_trainer_and_fsdp_gpu,
run_pipelines_torch_gpu,
run_pipelines_tf_gpu,
run_examples_gpu,
2  .github/workflows/ssh-runner.yml

@ -35,7 +35,7 @@ jobs:
shell: bash
run: |
if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV
elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then
@ -78,7 +78,7 @@ Once you've confirmed the bug hasn't already been reported, please include the f
To get the OS and software versions automatically, run the following command:

```bash
transformers env
transformers-cli env
```

You can also run the same command from the root of the repository:
@ -26,7 +26,7 @@ There are two main venues to receive support: [the forums](https://discuss.huggi

[The user forums](https://discuss.huggingface.co/) are supported by the wide community of the library users and backed up by developers when needed.

If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystallized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).
If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystalized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).

In particular all "Please explain" questions or objectively very user-specific feature requests belong to the forums. Here are some example of such questions:

@ -263,9 +263,9 @@ You are not required to read the following guidelines before opening an issue. H
But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying it. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like:

```
> How big is your GPU cluster?
> How big is your gpu cluster?

Our cluster is made of 256 GPUs.
Our cluster is made of 256 gpus.
```

If you are addressing multiple comments, quote the relevant parts of each before your answer. Some people use the same comment to do multiple replies, others separate them into separate comments. Either way works. The latter approach helps for linking to a specific comment.
2  Makefile

@ -79,7 +79,7 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency

fix-copies:
python utils/check_copies.py --fix_and_overwrite
python utils/check_modular_conversion.py --fix_and_overwrite
python utils/check_modular_conversion.py --fix_and_overwrite
python utils/check_dummies.py --fix_and_overwrite
python utils/check_doctest_list.py --fix_and_overwrite
python utils/check_docstrings.py --fix_and_overwrite
16  README.md

@ -70,7 +70,7 @@ Explore the [Hub](https://huggingface.com/) today to find a model and use Transf

## Installation

Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.1+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+.
Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.0+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+.

Create and activate a virtual environment with [venv](https://docs.python.org/3/library/venv.html) or [uv](https://docs.astral.sh/uv/), a fast Rust-based Python package and project manager.

@ -78,6 +78,7 @@ Create and activate a virtual environment with [venv](https://docs.python.org/3/
# venv
python -m venv .my-env
source .my-env/bin/activate

# uv
uv venv .my-env
source .my-env/bin/activate
@ -87,10 +88,10 @@ Install Transformers in your virtual environment.

```py
# pip
pip install "transformers[torch]"
pip install transformers

# uv
uv pip install "transformers[torch]"
uv pip install transformers
```

Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error.
@ -98,12 +99,7 @@ Install Transformers from source if you want the latest changes in the library o
```shell
git clone https://github.com/huggingface/transformers.git
cd transformers

# pip
pip install .[torch]

# uv
uv pip install .[torch]
pip install .
```

## Quickstart
@ -125,7 +121,7 @@ To chat with a model, the usage pattern is the same. The only difference is you
> [!TIP]
> You can also chat with a model directly from the command line.
> ```shell
> transformers chat Qwen/Qwen2.5-0.5B-Instruct
> transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
> ```

```py
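A minimal sketch of the equivalent Python-side chat call, assuming the same Qwen checkpoint as in the tip above (the model id, prompt, and generation arguments here are illustrative, not necessarily the README's own example):

```py
from transformers import pipeline

# Sketch only: chat-style generation through the high-level pipeline API.
chatbot = pipeline("text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")
messages = [{"role": "user", "content": "Give me one fun fact about transformers."}]
# The pipeline accepts chat messages and returns the continued conversation.
print(chatbot(messages, max_new_tokens=64)[0]["generated_text"])
```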
@ -27,6 +27,13 @@ These models require the `trust_remote_code=True` parameter to be set when using
the content of the modeling files when using this argument. We recommend setting a revision in order to ensure you
protect yourself from updates on the repository.
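A minimal sketch of that recommendation, pinning a reviewed revision when loading remote code (the repository id and commit hash are placeholders):

```py
from transformers import AutoModelForCausalLM

# Pinning `revision` means later changes to the remote modeling code are not
# pulled in silently. Repo id and hash below are placeholders for illustration.
model = AutoModelForCausalLM.from_pretrained(
    "some-org/some-custom-model",        # placeholder repository with custom code
    trust_remote_code=True,              # executes the repo's modeling code locally
    revision="<commit-sha-you-reviewed>",  # placeholder: a commit you have inspected
)
```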
#### Tools

Through the `Agent` framework, remote tools can be downloaded to be used by the Agent. You're to specify these tools
yourself, but please keep in mind that their code will be run on your machine if the Agent chooses to run them.

Please inspect the code of the tools before passing them to the Agent to protect your runtime and local setup.

## Reporting a Vulnerability

Feel free to submit vulnerability reports to [security@huggingface.co](mailto:security@huggingface.co), where someone from the HF security team will review and recommend next steps. If reporting a vulnerability specific to open source, please note [Huntr](https://huntr.com) is a vulnerability disclosure program for open source software.
@ -12,7 +12,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

## Writing metrics to the database

`MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.
`MetricRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.

cf [`llama.py`](./llama.py) to see an example of this in practice.
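A rough sketch of that threading pattern, assuming the sampling body stands in for the recorder's device readings (the real recorder calls live in `llama.py`):

```py
import threading
import time

def sample_device_metrics(samples, stop_event, interval_s=0.5):
    # Stand-in for polling GPU utilisation/memory and handing it to the recorder.
    while not stop_event.is_set():
        samples.append(time.time())
        time.sleep(interval_s)

samples, stop_event = [], threading.Event()
reader = threading.Thread(target=sample_device_metrics, args=(samples, stop_event), daemon=True)
reader.start()

time.sleep(2)  # stand-in for the main-thread model measurements

stop_event.set()
reader.join()
print(f"collected {len(samples)} device samples while the main thread worked")
```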
@ -90,7 +90,7 @@ def summarize(run_dir, metrics, expand_metrics=False):

model = benchmark.config.backend["model"]

# This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
# Ths looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
# (we rely on the usage of hydra's `${hydra.job.override_dirname}`.)
benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
benchmark_name = str(Path(benchmark_name).parts[-1])
@ -3,6 +3,7 @@ import importlib.util
import logging
import os
from typing import Dict
import psycopg2
import sys

from psycopg2.extras import Json
@ -118,7 +118,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
with torch.no_grad():
past_key_values = StaticCache(
model.config,
max_batch_size=batch_size,
batch_size=batch_size,
device=device,
dtype=torch.float16,
max_cache_len=seq_length + num_tokens_to_generate,
@ -144,7 +144,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

past_key_values = StaticCache(
model.config,
max_batch_size=batch_size,
batch_size=batch_size,
device=device,
dtype=torch.float16,
max_cache_len=seq_length + num_tokens_to_generate,
@ -187,7 +187,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
# TODO use decode_one_token(model, input_id.clone(), cache_position) for verification
past_key_values = StaticCache(
model.config,
max_batch_size=batch_size,
batch_size=batch_size,
device=device,
dtype=torch.float16,
max_cache_len=seq_length + num_tokens_to_generate + 10,
@ -204,7 +204,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
time_to_first_token = end - start
logger.info(f"completed first compile generation in: {time_to_first_token}s")
cache_position += 1
all_generated_tokens += next_token.tolist()
all_generated_tokens += next_token.clone().detach().cpu().tolist()

cache_position = torch.tensor([seq_length], device=device)
### First compile, decoding
@ -215,9 +215,9 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
torch.cuda.synchronize()
end = perf_counter()
time_to_second_token = end - start
logger.info(f"completed second compile generation in: {time_to_second_token}s")
logger.info(f"completed second compile generation in: {time_to_first_token}s")
cache_position += 1
all_generated_tokens += next_token.tolist()
all_generated_tokens += next_token.clone().detach().cpu().tolist()

### Second compile, decoding
start = perf_counter()
@ -227,15 +227,15 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
torch.cuda.synchronize()
end = perf_counter()
time_to_third_token = end - start
logger.info(f"completed third compile forward in: {time_to_third_token}s")
logger.info(f"completed third compile forward in: {time_to_first_token}s")
cache_position += 1
all_generated_tokens += next_token.tolist()
all_generated_tokens += next_token.clone().detach().cpu().tolist()

### Using cuda graphs decoding

start = perf_counter()
for _ in range(1, num_tokens_to_generate):
all_generated_tokens += next_token.tolist()
all_generated_tokens += next_token.clone().detach().cpu().tolist()
next_token = decode_one_token(
model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
)
@ -254,7 +254,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

past_key_values = StaticCache(
model.config,
max_batch_size=batch_size,
batch_size=batch_size,
device=device,
dtype=torch.float16,
max_cache_len=seq_length + 128,
@ -271,7 +271,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

past_key_values = StaticCache(
model.config,
max_batch_size=batch_size,
batch_size=batch_size,
device=device,
dtype=torch.float16,
max_cache_len=seq_length + 128,
@ -287,23 +287,23 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

past_key_values = StaticCache(
model.config,
max_batch_size=batch_size,
batch_size=batch_size,
device=device,
dtype=torch.float16,
max_cache_len=seq_length + 128,
)

# 3rd call
# 3nd call
start = perf_counter()
output = model.generate(**inputs, past_key_values=past_key_values)
end = perf_counter()
third_compile_generate_time = end - start
logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
logger.info(f"completed second compile generation in: {third_compile_generate_time}s")
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

past_key_values = StaticCache(
model.config,
max_batch_size=batch_size,
batch_size=batch_size,
device=device,
dtype=torch.float16,
max_cache_len=seq_length + 128,
@ -313,7 +313,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
output = model.generate(**inputs, past_key_values=past_key_values)
end = perf_counter()
fourth_compile_generate_time = end - start
logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s")
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

metrics_recorder.collect_model_measurements(
@ -46,6 +46,10 @@ NOT_DEVICE_TESTS = {
"test_keep_in_fp32_modules",
"test_gradient_checkpointing_backward_compatibility",
"test_gradient_checkpointing_enable_disable",
"test_save_load_fast_init_from_base",
"test_fast_init_context_manager",
"test_fast_init_tied_embeddings",
"test_save_load_fast_init_to_base",
"test_torch_save_load",
"test_initialization",
"test_forward_signature",
@ -66,6 +70,7 @@ NOT_DEVICE_TESTS = {
"ModelTester::test_pipeline_",
"/repo_utils/",
"/utils/",
"/agents/",
}

# allow having multiple repository checkouts and not needing to remember to rerun
@ -82,6 +87,7 @@ def pytest_configure(config):
config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
config.addinivalue_line("markers", "agent_tests: mark the agent tests that are run on their specific schedule")
config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
@ -2,8 +2,8 @@

In this folder you will find various docker files, and some subfolders.
- dockerfiles (ex: `consistency.dockerfile`) present under `~/docker` are used for our "fast" CIs. You should be able to use them for tasks that only need CPU. For example `torch-light` is a very light weights container (703MiB).
- subfolders contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs)
- subfloder contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs)

Note that in both case, you need to run `uv pip install -e .`, which should take around 5 seconds. We do it outside the dockerfile for the need of our CI: we checkout a new branch each time, and the `transformers` code is thus updated.

We are open to contribution, and invite the community to create dockerfiles with potential arguments that properly choose extras depending on the model's dependencies! :hugs:
We are open to contribution, and invite the community to create dockerfiles with potential arguments that properly choose extras depending on the model's dependencies! :hugs:
@ -5,12 +5,12 @@ ARG REF=main
RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
ENV UV_PYTHON=/usr/local/bin/python
RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
# tensorflow pin matching setup.py
RUN uv pip install --no-cache-dir pypi-kenlm
RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,testing,torch-speech,vision]"
RUN git lfs install

RUN uv pip uninstall transformers
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
RUN pip uninstall -y transformers
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
@ -16,12 +16,12 @@ RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
RUN make install -j 10

RUN uv pip install --no-cache --upgrade 'torch==2.6.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
# spacy is not used so not tested. Causes to failures. TODO fix later
RUN python3 -m unidic download
RUN uv pip uninstall transformers
RUN pip uninstall -y transformers

RUN apt-get clean && rm -rf /var/lib/apt/lists/*
RUN apt remove -y g++ cmake xz-utils libprotobuf-dev protobuf-compiler

@ -7,7 +7,7 @@ RUN apt-get install -y g++ cmake
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv
RUN uv pip install --no-cache-dir -U pip setuptools albumentations seqeval
RUN uv pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
RUN pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
RUN uv pip install --no-cache-dir "protobuf==3.20.3"
RUN uv pip uninstall transformers
RUN pip uninstall -y transformers
RUN apt-get clean && rm -rf /var/lib/apt/lists/*

@ -5,8 +5,8 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
RUN uv pip uninstall transformers
RUN pip uninstall -y transformers
RUN apt-get clean && rm -rf /var/lib/apt/lists/*

@ -5,13 +5,13 @@ USER root
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps timm accelerate
RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
# RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose' 'dataset'
RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose' 'dataset'
# RUN git clone https://github.com/facebookresearch/detectron2.git
# RUN python3 -m pip install --no-cache-dir -e detectron2
RUN uv pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3' --no-build-isolation
RUN uv pip uninstall transformers
RUN pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3'
RUN pip uninstall -y transformers
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
@ -5,6 +5,6 @@ USER root
|
||||
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]"
|
||||
RUN uv pip uninstall transformers
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
|
||||
RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]"
|
||||
RUN pip uninstall -y transformers
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
|
@ -5,6 +5,6 @@ USER root
|
||||
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]"
|
||||
RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]"
|
||||
RUN uv pip install --no-cache-dir "protobuf==3.20.3" tensorflow_probability
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
|
@ -5,7 +5,7 @@ USER root
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
|
||||
RUN uv pip uninstall transformers
|
||||
RUN pip uninstall -y transformers
|
@ -6,4 +6,4 @@ RUN apt-get update && apt-get install -y time git
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip install uv && uv venv
|
||||
RUN uv pip install --no-cache-dir -U pip setuptools GitPython "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ruff]" urllib3
|
||||
RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/*
|
@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-de
|
||||
RUN apt-get install -y cmake
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
|
||||
RUN pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
|
||||
RUN uv pip install --no-cache-dir "protobuf==3.20.3"
|
||||
RUN uv pip uninstall transformers
|
||||
RUN pip uninstall -y transformers
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
|
||||
|
@ -6,11 +6,11 @@ RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-deps accelerate
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]"
|
||||
RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]"
|
||||
|
||||
|
||||
# RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]"
|
||||
|
||||
RUN uv pip uninstall transformers
|
||||
RUN pip uninstall -y transformers
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
|
||||
|
@ -5,7 +5,7 @@ USER root
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
|
||||
RUN uv pip uninstall transformers
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words]"
|
||||
RUN pip uninstall -y transformers
|
||||
|
@ -7,13 +7,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-de
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN git lfs install
|
||||
|
||||
RUN uv pip install --no-cache-dir pypi-kenlm
|
||||
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]"
|
||||
RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]"
|
||||
RUN uv pip install --no-cache-dir "protobuf==3.20.3" librosa
|
||||
|
||||
|
||||
RUN uv pip uninstall transformers
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
|
||||
RUN pip uninstall -y transformers
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
|
@ -14,8 +14,6 @@ ARG PYTORCH='2.6.0'
|
||||
ARG INTEL_TORCH_EXT='2.3.0'
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
ARG CUDA='cu121'
|
||||
# Disable kernel mapping for now until all tests pass
|
||||
ENV DISABLE_KERNEL_MAPPING=1
|
||||
|
||||
RUN apt update
|
||||
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
|
||||
@ -59,8 +57,7 @@ RUN python3 -m pip uninstall -y ninja
|
||||
|
||||
# For `dinat` model
|
||||
# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
|
||||
# pin `0.17.4` otherwise `cannot import name 'natten2dav' from 'natten.functional'`
|
||||
RUN python3 -m pip install --no-cache-dir natten==0.17.4+torch250cu121 -f https://shi-labs.com/natten/wheels
|
||||
RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels
|
||||
|
||||
# For `nougat` tokenizer
|
||||
RUN python3 -m pip install --no-cache-dir python-Levenshtein
|
||||
@ -71,9 +68,6 @@ RUN python3 -m pip install --no-cache-dir g2p-en
|
||||
# For Some bitsandbytes tests
|
||||
RUN python3 -m pip install --no-cache-dir einops
|
||||
|
||||
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
|
||||
RUN python3 -m pip uninstall -y kernels
|
||||
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
# this line must be added in order for python to be aware of transformers.
|
||||
RUN cd transformers && python3 setup.py develop
|
||||
|
@ -1,12 +1,12 @@
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
|
||||
FROM nvcr.io/nvidia/pytorch:24.08-py3
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
|
||||
FROM nvcr.io/nvidia/pytorch:23.11-py3
|
||||
LABEL maintainer="Hugging Face"
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ARG PYTORCH='2.6.0'
|
||||
ARG PYTORCH='2.2.0'
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
ARG CUDA='cu126'
|
||||
ARG CUDA='cu121'
|
||||
|
||||
RUN apt -y update
|
||||
RUN apt install -y libaio-dev
|
||||
@ -15,8 +15,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
|
||||
ARG REF=main
|
||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
|
||||
|
||||
# `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
|
||||
RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
|
||||
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
|
||||
|
||||
# Install latest release PyTorch
|
||||
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
|
||||
@ -45,9 +44,6 @@ RUN python3 -m pip uninstall -y deepspeed
|
||||
# TODO: Find out why test fail.
|
||||
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
|
||||
|
||||
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
|
||||
RUN python3 -m pip uninstall -y kernels
|
||||
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
# this line must be added in order for python to be aware of transformers.
|
||||
RUN cd transformers && python3 setup.py develop
|
||||
|
@ -1,11 +1,11 @@
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
|
||||
FROM nvcr.io/nvidia/pytorch:24.08-py3
|
||||
FROM nvcr.io/nvidia/pytorch:23.11-py3
|
||||
LABEL maintainer="Hugging Face"
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
ARG CUDA='cu126'
|
||||
ARG CUDA='cu121'
|
||||
|
||||
RUN apt -y update
|
||||
RUN apt install -y libaio-dev
|
||||
@ -21,8 +21,7 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio
|
||||
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
|
||||
RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
|
||||
|
||||
# `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
|
||||
RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
|
||||
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
@ -57,9 +56,6 @@ RUN python3 -m pip uninstall -y deepspeed
|
||||
#RUN git clone https://github.com/pytorch/TensorRT.git
|
||||
#RUN cd TensorRT/py && python3 setup.py install --fx-only
|
||||
|
||||
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
|
||||
RUN python3 -m pip uninstall -y kernels
|
||||
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
# this line must be added in order for python to be aware of transformers.
|
||||
RUN cd transformers && python3 setup.py develop
|
||||
|
@ -28,9 +28,6 @@ RUN python3 -m pip uninstall -y tensorflow flax
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
|
||||
RUN python3 -m pip install -U "itsdangerous<2.1.0"
|
||||
|
||||
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
|
||||
RUN python3 -m pip uninstall -y kernels
|
||||
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
# this line must be added in order for python to be aware of transformers.
|
||||
RUN cd transformers && python3 setup.py develop
|
||||
|
@ -12,8 +12,6 @@ SHELL ["sh", "-lc"]
|
||||
ARG PYTORCH='2.6.0'
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
ARG CUDA='cu121'
|
||||
# Disable kernel mapping for quantization tests
|
||||
ENV DISABLE_KERNEL_MAPPING=1
|
||||
|
||||
RUN apt update
|
||||
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
|
||||
@ -84,15 +82,9 @@ RUN python3 -m pip install --no-cache-dir compressed-tensors
|
||||
# Add AMD Quark for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir amd-quark
|
||||
|
||||
# Add AutoRound for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
|
||||
|
||||
# Add transformers in editable mode
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
|
||||
|
||||
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
|
||||
RUN python3 -m pip uninstall -y kernels
|
||||
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
# this line must be added in order for python to be aware of transformers.
|
||||
RUN cd transformers && python3 setup.py develop
|
||||
|
@ -23,6 +23,8 @@
|
||||
title: تحميل النماذج المخصصة وتدريبها باستخدام 🤗 PEFT
|
||||
- local: model_sharing
|
||||
title: مشاركة نموذجك
|
||||
- local: agents
|
||||
title: الوكلاء
|
||||
- local: llm_tutorial
|
||||
title: التوليد باستخدام LLMs
|
||||
- local: conversations
|
||||
@ -250,6 +252,8 @@
|
||||
title: أطر مفاهيمية
|
||||
# - sections:
|
||||
# - sections:
|
||||
# - local: main_classes/agent
|
||||
# title: الوكلاء والأدوات
|
||||
# - local: model_doc/auto
|
||||
# title: فئات يتم إنشاؤها ديناميكيًا
|
||||
# - local: main_classes/backbones
|
||||
|
539
docs/source/ar/agents.md
Normal file
@ -0,0 +1,539 @@
|
||||
# الوكلاء والأدوات
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
### ما هو الوكيل؟
|
||||
|
||||
يمكن للنظم اللغوية الكبيرة (LLMs) التي تم تدريبها على أداء [نمذجة اللغة السببية](./tasks/language_modeling.) التعامل مع مجموعة واسعة من المهام، ولكنها غالبًا ما تواجه صعوبات في المهام الأساسية مثل المنطق والحساب والبحث. وعندما يتم استدعاؤها في مجالات لا تؤدي فيها أداءً جيدًا، فإنها غالبًا ما تفشل في توليد الإجابة التي نتوقعها منها.
|
||||
|
||||
يتمثل أحد النهج للتغلب على هذا القصور في إنشاء "وكيل".
|
||||
|
||||
الوكيل هو نظام يستخدم LLM كمحرك له، ولديه حق الوصول إلى وظائف تسمى "أدوات".
|
||||
|
||||
هذه "الأدوات" هي وظائف لأداء مهمة، وتحتوي على جميع الأوصاف اللازمة للوكيل لاستخدامها بشكل صحيح.
|
||||
|
||||
يمكن برمجة الوكيل للقيام بما يلي:
|
||||
- وضع سلسلة من الإجراءات/الأدوات وتشغيلها جميعًا في نفس الوقت مثل [`CodeAgent`] على سبيل المثال
|
||||
- التخطيط للاجراءات/الأدوات وتنفيذها واحدة تلو الأخرى والانتظار حتى انتهاء كل إجراء قبل إطلاق التالي مثل [`ReactJsonAgent`] على سبيل المثال
|
||||
|
||||
### أنواع الوكلاء
|
||||
|
||||
#### الوكيل البرمجي (Code agent)
|
||||
|
||||
يتبع هذا الوكيل خطوات محددة: أولًا، يخطط لسلسلة من الإجراءات التي يريد تنفيذها، ثم يولّد شفرة Python لتنفيذ جميع الإجراءات في نفس الوقت. وهو يتعامل بشكل أصلي مع أنواع مختلفة من المدخلات والمخرجات للأدوات التي يستخدمها، وبالتالي فهو الخيار الموصى به للمهام متعددة الوسائط.
|
||||
|
||||
#### وكلاء التفاعل
|
||||
|
||||
هذا هو الوكيل الذي يتم اللجوء إليه لحل مهام الاستدلال، حيث يجعل إطار ReAct ([Yao et al.، 2022](https://huggingface.co/papers/2210.03629)) من الكفاءة حقًا التفكير على أساس ملاحظاته السابقة.
|
||||
|
||||
نقوم بتنفيذ إصدارين من ReactJsonAgent:
|
||||
- [`ReactJsonAgent`] يقوم بتوليد استدعاءات الأدوات كـ JSON في إخراجها.
|
||||
- [`ReactCodeAgent`] هو نوع جديد من ReactJsonAgent يقوم بتوليد استدعاءات أدواته كمقاطع من التعليمات البرمجية، والتي تعمل بشكل جيد حقًا مع LLMs التي تتمتع بأداء قوي في البرمجة.
|
||||
|
||||
> [!TIP]
|
||||
> اقرأ منشور المدونة [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) لمعرفة المزيد عن وكيل ReAct.
|
||||
|
||||

|
||||
|
||||
على سبيل المثال، إليك كيف يعمل وكيل ReAct Code طريقه من خلال السؤال التالي.
|
||||
|
||||
```py3
|
||||
>>> agent.run(
|
||||
... "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
|
||||
... )
|
||||
=====New task=====
|
||||
How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
|
||||
====Agent is executing the code below:
|
||||
bert_blocks = search(query="number of blocks in BERT base encoder")
|
||||
print("BERT blocks:", bert_blocks)
|
||||
====
|
||||
Print outputs:
|
||||
BERT blocks: twelve encoder blocks
|
||||
|
||||
====Agent is executing the code below:
|
||||
attention_layer = search(query="number of layers in Attention is All You Need")
|
||||
print("Attention layers:", attention_layer)
|
||||
====
|
||||
Print outputs:
|
||||
Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture.
|
||||
|
||||
====Agent is executing the code below:
|
||||
bert_blocks = 12
|
||||
attention_layers = 6
|
||||
diff = bert_blocks - attention_layers
|
||||
print("Difference in blocks:", diff)
|
||||
final_answer(diff)
|
||||
====
|
||||
|
||||
Print outputs:
|
||||
Difference in blocks: 6
|
||||
|
||||
Final answer: 6
|
||||
```
|
||||
|
||||
### كيف يمكنني بناء وكيل؟
|
||||
|
||||
لتهيئة وكيل، تحتاج إلى هذه الوسائط:
|
||||
|
||||
- نموذج لغوي كبير (LLM) يشكل المحرك الأساسي للوكيل. الوكيل نفسه ليس النموذج اللغوي، بل هو برنامج يستخدم النموذج اللغوي كمحرك له.
|
||||
- موجه النظام (system prompt): هذه هي التعليمات التي يتم إعطاؤها للنموذج اللغوي لإنشاء مخرجاته.
|
||||
- صندوق أدوات (toolbox) يختار الوكيل منه الأدوات لتنفيذها
|
||||
- محلل (parser) لاستخراج الأدوات التي يجب استدعاؤها من مخرجات النموذج اللغوي LLM والأدوات التي يجب استخدامها
|
||||
|
||||
عند تهيئة نظام الوكيل، يتم استخدام سمات الأداة لإنشاء وصف للأداة، ثم يتم دمجها في موجه النظام الخاص `system_prompt` للوكيل لإعلامه بالأدوات التي يمكنه استخدامها ولماذا.
|
||||
|
||||
للبدء، يرجى تثبيت `agents` الإضافية لتثبيت جميع التبعيات الافتراضية.
|
||||
|
||||
```bash
|
||||
pip install transformers[agents]
|
||||
```
|
||||
|
||||
قم ببناء محرك LLM الخاص بك من خلال تعريف طريقة `llm_engine` التي تقبل قائمة من [الرسائل](./chat_templating.) وتعيد النص. يجب أن تقبل هذه الدالة القابلة للاستدعاء أيضًا معامل `stop` يشير إلى متى يجب التوقف عن التوليد.
|
||||
|
||||
```python
|
||||
from huggingface_hub import login, InferenceClient
|
||||
|
||||
login("<YOUR_HUGGINGFACEHUB_API_TOKEN>")
|
||||
|
||||
client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
|
||||
|
||||
def llm_engine(messages, stop_sequences=["Task"]) -> str:
|
||||
response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
|
||||
answer = response.choices[0].message.content
|
||||
return answer
|
||||
```
|
||||
|
||||
يمكنك استخدام أي طريقة `llm_engine` طالما أنها:
|
||||
1. يتبع تنسيق [رسائل](./chat_templating.md) لإدخاله (`List[Dict[str, str]]`) ويعيد `str`
|
||||
2. يتوقف عن توليد المخرجات عند التسلسلات التي تم تمريرها في معامل `stop` (انظر المثال التوضيحي أدناه)
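
فيما يلي رسم توضيحي مبسّط لمحرك وهمي يوضح هذا العقد (قائمة رسائل كمدخل، نص كمخرج، واحترام معامل `stop_sequences`). الدالة `echo_engine` هنا مجرد مثال افتراضي ولا تستدعي أي نموذج لغوي حقيقي:

```python
from transformers import CodeAgent

def echo_engine(messages, stop_sequences=["Task"]) -> str:
    # A stand-in engine used only to illustrate the contract:
    # it takes a list of {"role": ..., "content": ...} dicts and returns a string.
    last_user_message = messages[-1]["content"]
    output = f"Thought: received the task '{last_user_message}'"
    # Truncate at the first stop sequence, as a real engine is expected to do.
    for stop in stop_sequences:
        output = output.split(stop)[0]
    return output

# The dummy engine can be plugged in exactly like a real one.
agent = CodeAgent(tools=[], llm_engine=echo_engine, add_base_tools=True)
```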
|
||||
|
||||
أنت بحاجة أيضًا إلى معامل "الأدوات" الذي يقبل قائمة من "الأدوات". يمكنك توفير قائمة فارغة لـ "الأدوات"، ولكن استخدم صندوق الأدوات الافتراضي مع معامل اختياري `add_base_tools=True`.
|
||||
|
||||
الآن يمكنك إنشاء وكيل، مثل [`CodeAgent`], وتشغيله. ولتسهيل الأمر، نقدم أيضًا فئة [`HfEngine`] التي تستخدم `huggingface_hub.InferenceClient` بشكل مخفى.
|
||||
|
||||
```python
|
||||
from transformers import CodeAgent, HfEngine
|
||||
|
||||
llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
|
||||
agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
|
||||
|
||||
agent.run(
|
||||
"Could you translate this sentence from French, say it out loud and return the audio.",
|
||||
sentence="Où est la boulangerie la plus proche?",
|
||||
)
|
||||
```
|
||||
|
||||
هذه الميزة ستكون مفيدة في حالة الحاجة الملحة! يمكنك حتى ترك معامل `llm_engine` غير محدد، وسيتم إنشاء [`HfEngine`] بشكل تلقائي.
|
||||
|
||||
```python
|
||||
from transformers import CodeAgent
|
||||
|
||||
agent = CodeAgent(tools=[], add_base_tools=True)
|
||||
|
||||
agent.run(
|
||||
"Could you translate this sentence from French, say it out loud and give me the audio.",
|
||||
sentence="Où est la boulangerie la plus proche?",
|
||||
)
|
||||
```
|
||||
|
||||
لاحظ أننا استخدمنا معامل "sentence" إضافي: يمكنك تمرير النص كمعامل إضافي إلى النموذج.
|
||||
|
||||
يمكنك أيضًا استخدام هذا للإشارة إلى مسار الملفات المحلية أو البعيدة للنموذج لاستخدامها:
|
||||
|
||||
```py
|
||||
from transformers import ReactCodeAgent
|
||||
|
||||
agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
|
||||
|
||||
agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
|
||||
```
|
||||
|
||||
|
||||
تم تحديد موجه النظام ومحلل المخرجات تلقائيًا، ولكن يمكنك فحصهما بسهولة عن طريق استدعاء `system_prompt_template` على وكيلك.
|
||||
|
||||
```python
|
||||
print(agent.system_prompt_template)
|
||||
```
|
||||
|
||||
من المهم أن تشرح بأكبر قدر ممكن من الوضوح المهمة التي تريد تنفيذها.
|
||||
كل عملية [`~Agent.run`] مستقلة، وبما أن الوكيل مدعوم من LLM، فقد تؤدي الاختلافات الطفيفة في موجهك إلى نتائج مختلفة تمامًا.
|
||||
يمكنك أيضًا تشغيل وكيل بشكل متتالي لمهام مختلفة: في كل مرة يتم فيها إعادة تهيئة سمتي `agent.task` و`agent.logs`.
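
على سبيل المثال، يمكن استدعاء نفس الوكيل لمهمتين مختلفتين على التوالي؛ هذا مجرد مقتطف توضيحي يفترض أن `llm_engine` قد تم تعريفه كما في الأعلى:

```python
from transformers import CodeAgent

agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)

# Each call is independent: `agent.task` and `agent.logs` are re-initialized on every run.
agent.run("Translate this sentence to Spanish.", sentence="Where is the nearest bakery?")
agent.run("Now generate an image of a mountain lake.")
```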
|
||||
|
||||
|
||||
#### تنفيذ التعليمات البرمجية
|
||||
|
||||
يقوم مفسر Python بتنفيذ التعليمات البرمجية على مجموعة من المدخلات التي يتم تمريرها جنبًا إلى جنب مع أدواتك.
|
||||
يجب أن يكون هذا الأمر آمنًا لأن الوظائف الوحيدة التي يمكن استدعاؤها هي الأدوات التي قدمتها (خاصة إذا كانت أدوات من Hugging Face فقط) ووظيفة الطباعة، لذا فأنت مقيد بالفعل بما يمكن تنفيذه.
|
||||
|
||||
مفسر Python لا يسمح أيضًا باستدعاء دوال بشكل افتراضي خارج قائمة آمنة، لذا فإن جميع الهجمات الأكثر وضوحًا لا ينبغي أن تكون مشكلة.
|
||||
يمكنك أيضًا الإذن باستيرادات إضافية عن طريق تمرير الوحدات النمطية المصرح بها كقائمة من السلاسل في معامل `additional_authorized_imports` عند تهيئة [`ReactCodeAgent`] أو [`CodeAgent`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import ReactCodeAgent
|
||||
|
||||
>>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
|
||||
>>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
|
||||
|
||||
(...)
|
||||
'Hugging Face – Blog'
|
||||
```
|
||||
|
||||
سيتم إيقاف التنفيذ عند أي رمز يحاول تنفيذ عملية غير قانونية أو إذا كان هناك خطأ Python عادي في التعليمات البرمجية التي تم إنشاؤها بواسطة الوكيل.
|
||||
|
||||
> [!WARNING]
|
||||
> يمكن لـ LLM توليد شفرة برمجية عشوائية سيتم تنفيذها بعد ذلك: لا تقم باستدعاء أي دوال غير آمنة!
|
||||
|
||||
### موجه النظام
|
||||
|
||||
يقوم الوكيل، أو بالأحرى LLM الذي يقود الوكيل، بتوليد مخرجاته بناءً على موجه النظام. يمكن تخصيص موجه النظام وتصميمه للمهام المقصودة. على سبيل المثال، تحقق من موجه النظام لـ [`ReactCodeAgent`] (الإصدار أدناه مبسط قليلاً).
|
||||
|
||||
```text
|
||||
You will be given a task to solve as best you can.
|
||||
You have access to the following tools:
|
||||
<<tool_descriptions>>
|
||||
|
||||
To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
|
||||
|
||||
At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
|
||||
Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence.
|
||||
During each intermediate step, you can use 'print()' to save whatever important information you will then need.
|
||||
These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
|
||||
|
||||
In the end you have to return a final answer using the `final_answer` tool.
|
||||
|
||||
Here are a few examples using notional tools:
|
||||
---
|
||||
{examples}
|
||||
|
||||
Above example were using notional tools that might not exist for you. You only have access to those tools:
|
||||
<<tool_names>>
|
||||
You also can perform computations in the python code you generate.
|
||||
|
||||
Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward.
|
||||
|
||||
Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
|
||||
Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
|
||||
|
||||
Remember to make sure that variables you use are all defined.
|
||||
|
||||
Now Begin!
|
||||
```
|
||||
|
||||
يتضمن موجه النظام:
|
||||
- *مقدمة* تشرح كيف يجب أن يتصرف الوكيل والأدوات التي يجب عليه استخدامها.
|
||||
- وصف لجميع الأدوات التي يتم تحديدها بواسطة رمز `<<tool_descriptions>>` الذي يتم استبداله ديناميكيًا في وقت التشغيل بالأدوات التي يحددها المستخدم أو يختارها.
|
||||
- يأتي وصف الأداة من سمات الأداة، `name`، و`description`، و`inputs` و`output_type`، وقالب `jinja2` بسيط يمكنك تحسينه.
|
||||
- شكل المخرج المتوقع.
|
||||
|
||||
يمكنك تحسين موجه النظام، على سبيل المثال، عن طريق إضافة شرح لتنسيق المخرجات.
|
||||
|
||||
للحصول على أقصى قدر من المرونة، يمكنك الكتابة فوق قالب موجه النظام بالكامل عن طريق تمرير موجه مخصص كمعامل إلى معلمة `system_prompt`.
|
||||
|
||||
```python
|
||||
from transformers import ReactJsonAgent
|
||||
from transformers.agents import PythonInterpreterTool
|
||||
|
||||
agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
|
||||
```
|
||||
|
||||
> [!WARNING]
|
||||
> يرجى التأكد من تحديد سلسلة `<<tool_descriptions>>` في مكان ما في `template` حتى يكون الوكيل على علم
|
||||
بالأدوات المتاحة.
|
||||
|
||||
|
||||
### فحص تشغيل الوكيل
|
||||
|
||||
فيما يلي بعض السمات المفيدة لفحص ما حدث بعد التشغيل:
|
||||
- تخزن `agent.logs` سجلات مفصلة للوكيل. في كل خطوة من تشغيل الوكيل، يتم تخزين كل شيء في قاموس يُلحق بـ `agent.logs` (انظر المثال أدناه).
|
||||
- تشغيل `agent.write_inner_memory_from_logs()` يخلق ذاكرة داخلية لسجلات الوكيل للنظام LLM لعرضها، كقائمة من رسائل الدردشة. تنتقل هذه الطريقة عبر كل خطوة من سجل الوكيل ولا تخزن سوى ما يهمها كرسالة: على سبيل المثال، سيحفظ موجه النظام والمهمة في رسائل منفصلة، ثم لكل خطوة سيخزن مخرج LLM كرسالة، ومخرج استدعاء الأداة كرسالة أخرى. استخدم هذا إذا كنت تريد عرضًا عامًا لما حدث - ولكن لن يتم نسخ كل سجل بواسطة هذه الطريقة.
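
مثال توضيحي مبسّط لفحص سجلات وكيل بعد التشغيل؛ يفترض أن `agent` قد تم إنشاؤه وتشغيله كما في الأمثلة السابقة، وأن رسائل الذاكرة الداخلية تتبع الشكل المعتاد `{"role": ..., "content": ...}`:

```python
agent.run("What is the result of 2 to the power of 3.7384?")

# Raw, step-by-step records of the run: one dict appended to `agent.logs` per step.
for step_log in agent.logs:
    print(step_log)

# The same run rendered as a list of chat messages (the exact keys are an assumption).
for message in agent.write_inner_memory_from_logs():
    print(message)
```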
|
||||
|
||||
## الأدوات
|
||||
|
||||
الأداة هي عبارة عن وظيفة أساسية يستخدمها الوكيل لتنفيذ مهمة محددة.
|
||||
|
||||
يمكنك على سبيل المثال التحقق من [`PythonInterpreterTool`]: لديه اسم ووصف ووصف للمدخلات ونوع للمخرج، وطريقة `__call__` التي تقوم بتنفيذ المهمة المطلوبة.
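
على سبيل المثال، يمكن فحص هذه السمات مباشرة (مقتطف توضيحي):

```python
from transformers.agents import PythonInterpreterTool

tool = PythonInterpreterTool()

# The attributes described above; `__call__` is what actually runs the tool.
print(tool.name)
print(tool.description)
print(tool.inputs)
print(tool.output_type)
```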
|
||||
|
||||
عند تهيئة الوكيل، يتم استخدام سمات الأداة لتوليد وصف للأداة يتم تضمينه في موجه النظام الخاص بالوكيل. يتيح هذا للوكيل معرفة الأدوات التي يمكنه استخدامها ولماذا.
|
||||
|
||||
### صندوق الأدوات الافتراضي
|
||||
|
||||
يأتي Transformers مع صندوق أدوات افتراضي لتمكين الوكلاء، والذي يمكنك إضافته إلى وكيلك عند التهيئة باستخدام معامل `add_base_tools = True`:
|
||||
|
||||
- **الإجابة على أسئلة المستند**: الإجابة على سؤال حول المستند (مثل ملف PDF) بتنسيق صورة ([Donut](./model_doc/donut))
|
||||
- **الإجابة على أسئلة الصور**: الإجابة على سؤال حول صورة ([VILT](./model_doc/vilt))
|
||||
- **التحدث إلى النص**: قم بتفريغ الكلام إلى نص ([Whisper](./model_doc/whisper))
|
||||
- **النص إلى كلام**: تحويل النص إلى كلام ([SpeechT5](./model_doc/speecht5))
|
||||
- **الترجمة**: ترجمة جملة معينة من لغة المصدر إلى لغة الهدف.
|
||||
- **مفسر كود Python**: تشغيل كود Python الذي تم إنشاؤه بواسطة LLM في بيئة آمنة. لن يتم إضافة هذه الأداة إلى [`ReactJsonAgent`] إلا إذا استخدمت `add_base_tools=True`، نظرًا لأن الأدوات المستندة إلى التعليمات البرمجية يمكنها بالفعل تنفيذ كود Python
|
||||
|
||||
|
||||
يمكنك استخدام أداة يدويًا عن طريق استدعاء دالة [`load_tool`] وتحديد مهمة لتنفيذها.
|
||||
|
||||
```python
|
||||
from transformers import load_tool
|
||||
|
||||
tool = load_tool("text-to-speech")
|
||||
audio = tool("This is a text to speech tool")
|
||||
```
|
||||
|
||||
### إنشاء أداة جديدة
|
||||
|
||||
يمكنك إنشاء أداتك الخاصة لتغطية حالات الاستخدام التي لا تغطيها الأدوات الافتراضية من Hugging Face.
|
||||
على سبيل المثال، دعنا نقوم بإنشاء أداة تعرض النموذج الأكثر تنزيلًا لمهمة معينة من Hub.
|
||||
|
||||
سوف نبدأ بالكود التالي.
|
||||
|
||||
```python
|
||||
from huggingface_hub import list_models
|
||||
|
||||
task = "text-classification"
|
||||
|
||||
model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
|
||||
print(model.id)
|
||||
```
|
||||
|
||||
يمكن تحويل هذه الشيفرة إلى فئة ترث من الفئة العليا [`Tool`].
|
||||
|
||||
تحتاج الأداة المخصصة إلى:
|
||||
|
||||
- اسم `name`، والتي تمثل اسم الأداة نفسها. عادةً ما يصف الاسم وظيفتها. بما أن الكود يعيد النموذج الأكثر تنزيلًا لمهمة ما، فلنسمها `model_download_counter`.
|
||||
- تستخدم خاصية `description` لملء موجه نظام الوكيل.
|
||||
- خاصية `inputs`، والتي هي عبارة عن قاموس بمفاتيح "type" و"description". يحتوي على معلومات تساعد المفسر Python على اتخاذ خيارات مستنيرة بشأن المدخلات.
|
||||
- خاصية `output_type`، والتي تحدد نوع المخرج.
|
||||
- طريقة `forward` والتي تحتوي على الكود الذي سيتم تنفيذه للحصول على النتيجة النهائية.
|
||||
|
||||
```python
|
||||
from transformers import Tool
|
||||
from huggingface_hub import list_models
|
||||
|
||||
class HFModelDownloadsTool(Tool):
|
||||
name = "model_download_counter"
|
||||
description = (
|
||||
"This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
|
||||
"It returns the name of the checkpoint."
|
||||
)
|
||||
|
||||
inputs = {
|
||||
"task": {
|
||||
"type": "text",
|
||||
"description": "the task category (such as text-classification, depth-estimation, etc)",
|
||||
}
|
||||
}
|
||||
output_type = "text"
|
||||
|
||||
def forward(self, task: str):
|
||||
model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
|
||||
return model.id
|
||||
```
|
||||
|
||||
الآن بعد أن أصبحت فئة `HfModelDownloadsTool` المخصصة جاهزة، يمكنك حفظها في ملف باسم `model_downloads.py` واستيرادها للاستخدام.
|
||||
|
||||
```python
|
||||
from model_downloads import HFModelDownloadsTool
|
||||
|
||||
tool = HFModelDownloadsTool()
|
||||
```
|
||||
|
||||
يمكنك أيضًا مشاركة أداتك المخصصة في Hub عن طريق استدعاء [`~Tool.push_to_hub`] على الأداة. تأكد من أنك قمت بإنشاء مستودع لها على Hub وأنك تستخدم رمز وصول له صلاحية الكتابة.
|
||||
|
||||
```python
|
||||
tool.push_to_hub("{your_username}/hf-model-downloads")
|
||||
```
|
||||
|
||||
قم بتحميل الأداة باستخدام دالة [`~Tool.load_tool`] ومررها إلى معلمة `tools` في الوكيل الخاص بك.
|
||||
|
||||
```python
|
||||
from transformers import load_tool, CodeAgent
|
||||
|
||||
model_download_tool = load_tool("m-ric/hf-model-downloads")
|
||||
agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
|
||||
agent.run(
|
||||
"Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
|
||||
)
|
||||
```
|
||||
|
||||
ستحصل على ما يلي:
|
||||
|
||||
```text
|
||||
======== New task ========
|
||||
Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
|
||||
==== Agent is executing the code below:
|
||||
most_downloaded_model = model_download_counter(task="text-to-video")
|
||||
print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
|
||||
====
|
||||
```
|
||||
|
||||
والناتج:
|
||||
|
||||
`"النموذج الأكثر تنزيلًا لمهمة `text-to-video` هو ByteDance/AnimateDiff-Lightning."`
|
||||
|
||||
### إدارة صندوق أدوات الوكيل الخاص بك
|
||||
|
||||
إذا كنت قد قمت بتهيئة وكيل، فمن غير الملائم إعادة تهيئته من البداية لإضافة أداة جديدة ترغب في استخدامها. باستخدام مكتبة Transformers، يمكنك إدارة صندوق أدوات الوكيل بإضافة أو استبدال أداة موجودة.
|
||||
|
||||
دعنا نضيف الأداة `model_download_tool` إلى وكيل تم تهيئته مسبقًا باستخدام صندوق الأدوات الافتراضي.
|
||||
|
||||
```python
|
||||
from transformers import CodeAgent
|
||||
|
||||
agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
|
||||
agent.toolbox.add_tool(model_download_tool)
|
||||
```
|
||||
|
||||
الآن يمكننا الاستفادة من الأداة الجديدة وأداة تحويل النص إلى كلام السابقة:
|
||||
|
||||
```python
|
||||
agent.run(
|
||||
"Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?"
|
||||
)
|
||||
```
|
||||
|
||||
| **Audio** |
|
||||
|------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
|
||||
|
||||
> [!WARNING]
|
||||
> احترس عند إضافة أدوات إلى وكيل يعمل بالفعل لأنه يمكن أن يؤثر على اختيار الأداة لصالح أداتك أو اختيار أداة أخرى غير المحددة بالفعل.
|
||||
|
||||
استخدم طريقة `agent.toolbox.update_tool()` لاستبدال أداة موجودة في صندوق أدوات الوكيل.
|
||||
هذا مفيد إذا كانت أداتك الجديدة بديلاً مباشرًا للأداة الموجودة لأن الوكيل يعرف بالفعل كيفية تنفيذ تلك المهمة المحددة.
|
||||
تأكد فقط من اتباع الأداة الجديدة لنفس واجهة برمجة التطبيقات (API) للأداة المستبدلة أو قم بتكييف قالب موجه النظام لضمان تحديث جميع الأمثلة التي تستخدم الأداة المستبدلة.
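
مقتطف توضيحي؛ `my_new_text_to_speech_tool` هنا أداة افتراضية تحمل نفس الاسم ونفس الواجهة (المدخلات والمخرجات) للأداة التي تستبدلها:

```python
from transformers import CodeAgent

agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)

# Hypothetical drop-in replacement: same `name`, `description`, `inputs` and `output_type`
# as the tool it replaces, so the examples in the system prompt stay valid.
agent.toolbox.update_tool(my_new_text_to_speech_tool)
```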
|
||||
|
||||
### استخدام مجموعة من الأدوات
|
||||
|
||||
يمكنك الاستفادة من مجموعات الأدوات باستخدام كائن ToolCollection، مع تحديد مجموعة الأدوات التي تريد استخدامها.
|
||||
ثم قم بتمريرها كقائمة لتهيئة الوكيل الخاص بك، وبدء استخدامها!
|
||||
|
||||
```py
|
||||
from transformers import ToolCollection, ReactCodeAgent
|
||||
|
||||
image_tool_collection = ToolCollection(collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f")
|
||||
agent = ReactCodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True)
|
||||
|
||||
agent.run("Please draw me a picture of rivers and lakes.")
|
||||
```
|
||||
|
||||
لتسريع البداية، يتم تحميل الأدوات فقط إذا استدعاها الوكيل.
|
||||
|
||||
ستحصل على هذه الصورة:
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" />
|
||||
|
||||
### استخدام gradio-tools
|
||||
|
||||
[gradio-tools](https://github.com/freddyaboulton/gradio-tools) هي مكتبة قوية تتيح استخدام Hugging
|
||||
Face Spaces كأدوات. تدعم العديد من المساحات الموجودة بالإضافة إلى مساحات مخصصة.
|
||||
|
||||
تدعم مكتبة Transformers `gradio_tools` باستخدام طريقة [`Tool.from_gradio`] في الفئة. على سبيل المثال، دعنا نستخدم [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) من مجموعة أدوات `gradio-tools` لتحسين المطالبات لإنشاء صور أفضل.
|
||||
|
||||
استورد وقم بتهيئة الأداة، ثم مررها إلى طريقة `Tool.from_gradio`:
|
||||
|
||||
```python
|
||||
from gradio_tools import StableDiffusionPromptGeneratorTool
|
||||
from transformers import Tool, load_tool, CodeAgent
|
||||
|
||||
gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
|
||||
prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
|
||||
```
|
||||
|
||||
الآن يمكنك استخدامه مثل أي أداة أخرى. على سبيل المثال، دعنا نحسن الموجه `a rabbit wearing a space suit`.
|
||||
|
||||
```python
|
||||
image_generation_tool = load_tool('huggingface-tools/text-to-image')
|
||||
agent = CodeAgent(tools=[prompt_generator_tool, image_generation_tool], llm_engine=llm_engine)
|
||||
|
||||
agent.run(
|
||||
"Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit'
|
||||
)
|
||||
```
|
||||
|
||||
يستفيد النموذج بشكل كافٍ من الأداة:
|
||||
|
||||
```text
|
||||
======== New task ========
|
||||
Improve this prompt, then generate an image of it.
|
||||
You have been provided with these initial arguments: {'prompt': 'A rabbit wearing a space suit'}.
|
||||
==== Agent is executing the code below:
|
||||
improved_prompt = StableDiffusionPromptGenerator(query=prompt)
|
||||
while improved_prompt == "QUEUE_FULL":
|
||||
improved_prompt = StableDiffusionPromptGenerator(query=prompt)
|
||||
print(f"The improved prompt is {improved_prompt}.")
|
||||
image = image_generator(prompt=improved_prompt)
|
||||
====
|
||||
```
|
||||
|
||||
قبل إنشاء الصورة أخيرًا:
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit_spacesuit_flux.webp" />
|
||||
|
||||
> [!WARNING]
|
||||
> تتطلب gradio-tools إدخالات وإخراجات *نصية* حتى عند العمل مع طرائق مختلفة مثل كائنات الصور والصوت. الإدخالات والإخراجات الصورية والصوتية غير متوافقة حاليًا.
|
||||
|
||||
### استخدام أدوات LangChain
|
||||
|
||||
نحن نحب Langchain ونعتقد أنها تحتوي على مجموعة أدوات قوية للغاية.
|
||||
لاستيراد أداة من LangChain، استخدم الطريقة `from_langchain()`.
|
||||
|
||||
فيما يلي كيفية استخدامها لإعادة إنشاء نتيجة البحث في المقدمة باستخدام أداة بحث الويب LangChain.
|
||||
|
||||
```python
|
||||
from langchain.agents import load_tools
|
||||
from transformers import Tool, ReactCodeAgent
|
||||
|
||||
search_tool = Tool.from_langchain(load_tools(["serpapi"])[0])
|
||||
|
||||
agent = ReactCodeAgent(tools=[search_tool])
|
||||
|
||||
agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?")
|
||||
```
|
||||
|
||||
## واجهة Gradio
|
||||
|
||||
يمكنك الاستفادة من `gradio.Chatbot` لعرض أفكار الوكيل الخاص بك باستخدام `stream_to_gradio`، إليك مثال:
|
||||
|
||||
```py
|
||||
import gradio as gr
|
||||
from transformers import (
|
||||
load_tool,
|
||||
ReactCodeAgent,
|
||||
HfEngine,
|
||||
stream_to_gradio,
|
||||
)
|
||||
|
||||
# Import tool from Hub
|
||||
image_generation_tool = load_tool("m-ric/text-to-image")
|
||||
|
||||
llm_engine = HfEngine("meta-llama/Meta-Llama-3-70B-Instruct")
|
||||
|
||||
# Initialize the agent with the image generation tool
|
||||
agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
|
||||
|
||||
|
||||
def interact_with_agent(task):
|
||||
messages = []
|
||||
messages.append(gr.ChatMessage(role="user", content=task))
|
||||
yield messages
|
||||
for msg in stream_to_gradio(agent, task):
|
||||
messages.append(msg)
|
||||
yield messages + [
|
||||
gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
|
||||
]
|
||||
yield messages
|
||||
|
||||
|
||||
with gr.Blocks() as demo:
|
||||
text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
|
||||
submit = gr.Button("Run illustrator agent!")
|
||||
chatbot = gr.Chatbot(
|
||||
label="Agent",
|
||||
type="messages",
|
||||
avatar_images=(
|
||||
None,
|
||||
"https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
|
||||
),
|
||||
)
|
||||
submit.click(interact_with_agent, [text_input], [chatbot])
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo.launch()
|
||||
```
|
@ -77,7 +77,7 @@ model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)
|
||||
|
||||
الآن لديك إمكانية الوصول إلى النسخة الكاملة غير المكممة للنموذج في بيئة PyTorch، حيث يمكنك دمجه مع مجموعة كبيرة من الأدوات الأخرى.
|
||||
|
||||
لإعادة التحويل إلى ملف `gguf`، نوصي باستخدام ملف [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py) من llama.cpp.
|
||||
لإعادة التحويل إلى ملف `gguf`، نوصي باستخدام ملف [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) من llama.cpp.
|
||||
|
||||
فيما يلي كيفية إكمال البرنامج النصي أعلاه لحفظ النموذج وإعادة تصديره مرة أخرى إلى `gguf`:
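
مقتطف توضيحي لما قد يبدو عليه ذلك، بافتراض أن `model` و`tokenizer` هما الكائنان اللذان تم تحميلهما في المثال أعلاه وأن مستودع llama.cpp متوفر محليًا:

```python
# Save the dequantized model and its tokenizer to a local directory.
tokenizer.save_pretrained("dequantized-model")
model.save_pretrained("dequantized-model")

# Then, from a shell, convert the saved checkpoint back to GGUF with llama.cpp
# (the exact flags may differ between llama.cpp versions):
#   python llama.cpp/convert_hf_to_gguf.py dequantized-model --outfile model.gguf
```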
|
||||
|
||||
|
@ -674,7 +674,29 @@ use_cpu: false
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Tensor Parallelism with PyTorch 2">
|
||||
|
||||
```yml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
tp_config:
|
||||
tp_size: 4
|
||||
distributed_type: TP
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: 'no'
|
||||
num_machines: 1
|
||||
num_processes: 4
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
يُعد أمر [`accelerate_launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) الطريقة المُوصى بها لتشغيل النص البرمجي للتدريب على نظام موزع باستخدام Accelerate و [`Trainer`] مع المعلمات المحددة في `config_file.yaml`. يتم حفظ هذا الملف في مجلد ذاكرة التخزين المؤقت لـ Accelerate ويتم تحميله تلقائيًا عند تشغيل `accelerate_launch`.
|
||||
|
||||
|
@ -23,6 +23,8 @@
|
||||
title: Laden und Trainieren von Adaptern mit 🤗 PEFT
|
||||
- local: model_sharing
|
||||
title: Ein Modell teilen
|
||||
- local: transformers_agents
|
||||
title: Agents
|
||||
- local: llm_tutorial
|
||||
title: Generation with LLMs
|
||||
title: Tutorials
|
||||
@ -37,4 +39,4 @@
|
||||
title: Testen
|
||||
- local: pr_checks
|
||||
title: Überprüfung einer Pull Request
|
||||
title: Contribute
|
||||
title: Contribute
|
@ -95,7 +95,7 @@ wie der Code geschrieben werden sollte :-)
|
||||
1. Der Vorwärtsdurchlauf Ihres Modells sollte vollständig in die Modellierungsdatei geschrieben werden und dabei völlig unabhängig von anderen
Modellen in der Bibliothek sein. Wenn Sie einen Block aus einem anderen Modell wiederverwenden möchten, kopieren Sie den Code und fügen Sie ihn mit einem
|
||||
`# Kopiert von` ein (siehe [hier](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
|
||||
für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from).
|
||||
für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from).
|
||||
2. Der Code sollte vollständig verständlich sein, auch für einen Nicht-Muttersprachler. Das heißt, Sie sollten
|
||||
beschreibende Variablennamen wählen und Abkürzungen vermeiden. Ein Beispiel: `activation` ist `act` vorzuziehen.
|
||||
Von Variablennamen mit nur einem Buchstaben wird dringend abgeraten, es sei denn, es handelt sich um einen Index in einer for-Schleife.
|
||||
@ -402,7 +402,7 @@ Andernfalls beginnen wir mit der Erstellung eines neuen Modells. Wir empfehlen d
|
||||
ein bestehendes Modell:
|
||||
|
||||
```bash
|
||||
transformers add-new-model-like
|
||||
transformers-cli add-new-model-like
|
||||
```
|
||||
|
||||
Sie werden mit einem Fragebogen aufgefordert, die grundlegenden Informationen Ihres Modells einzugeben.
|
||||
|
@ -63,7 +63,7 @@ Wenn Sie sich vergewissert haben, dass der Fehler noch nicht gemeldet wurde, geb
|
||||
Um das Betriebssystem und die Softwareversionen automatisch auszugeben, führen Sie den folgenden Befehl aus:
|
||||
|
||||
```bash
|
||||
transformers env
|
||||
transformers-cli env
|
||||
```
|
||||
|
||||
Sie können denselben Befehl auch im Hauptverzeichnis des Repositorys ausführen:
|
||||
|
323
docs/source/de/transformers_agents.md
Normal file
@ -0,0 +1,323 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Transformers Agents
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Transformers Agents ist eine experimentelle API, die jederzeit geändert werden kann. Die von den Agenten
zurückgegebenen Ergebnisse können variieren, da sich die APIs oder die zugrunde liegenden Modelle ändern können.
|
||||
|
||||
</Tip>
|
||||
|
||||
Transformers Agents wurde in Version v4.29.0 eingeführt und baut auf dem Konzept von *Tools* und *Agenten* auf. Sie können damit in
[diesem Colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj) experimentieren.
|
||||
|
||||
Kurz gesagt, es bietet eine API für natürliche Sprache auf der Grundlage von Transformers: Wir definieren eine Reihe von kuratierten Tools und entwerfen einen
|
||||
Agenten, um natürliche Sprache zu interpretieren und diese Werkzeuge zu verwenden. Es ist von vornherein erweiterbar; wir haben einige relevante Tools kuratiert,
|
||||
aber wir werden Ihnen zeigen, wie das System einfach erweitert werden kann, um jedes von der Community entwickelte Tool zu verwenden.
|
||||
|
||||
Beginnen wir mit einigen Beispielen dafür, was mit dieser neuen API erreicht werden kann.
Sie ist besonders leistungsstark, wenn es um multimodale Aufgaben geht. Lassen Sie uns also eine Runde drehen, um Bilder zu erzeugen und Text vorzulesen.
|
||||
|
||||
```py
|
||||
agent.run("Caption the following image", image=image)
|
||||
```
|
||||
|
||||
| **Input** | **Output** |
|
||||
|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------|
|
||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/beaver.png" width=200> | A beaver is swimming in the water |
|
||||
|
||||
---
|
||||
|
||||
```py
|
||||
agent.run("Read the following text out loud", text=text)
|
||||
```
|
||||
| **Input** | **Output** |
|
||||
|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------|
|
||||
| A beaver is swimming in the water | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tts_example.wav" type="audio/wav"> your browser does not support the audio element. </audio>
|
||||
|
||||
---
|
||||
|
||||
```py
|
||||
agent.run(
|
||||
"In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?",
|
||||
document=document,
|
||||
)
|
||||
```
|
||||
| **Input** | **Output** |
|
||||
|-----------------------------------------------------------------------------------------------------------------------------|----------------|
|
||||
| <img src="https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/0/image/image.jpg" width=200> | ballroom foyer |
|
||||
|
||||
## Schnellstart
|
||||
|
||||
Bevor Sie `agent.run` verwenden können, müssen Sie einen Agenten instanziieren, der ein großes Sprachmodell (LLM) ist.
|
||||
Wir bieten Unterstützung für openAI-Modelle sowie für OpenSource-Alternativen von BigCode und OpenAssistant. Die openAI
|
||||
Modelle sind leistungsfähiger (erfordern aber einen openAI-API-Schlüssel, können also nicht kostenlos verwendet werden); Hugging Face
|
||||
bietet kostenlosen Zugang zu Endpunkten für BigCode- und OpenAssistant-Modelle.
|
||||
|
||||
Installieren Sie zunächst die `agents`-Extras, um alle Standardabhängigkeiten zu installieren.
|
||||
```bash
|
||||
pip install transformers[agents]
|
||||
```
|
||||
|
||||
Um openAI-Modelle zu verwenden, instanziieren Sie einen [`OpenAiAgent`], nachdem Sie die `openai`-Abhängigkeit installiert haben:
|
||||
|
||||
```bash
|
||||
pip install openai
|
||||
```
|
||||
|
||||
|
||||
```py
|
||||
from transformers import OpenAiAgent
|
||||
|
||||
agent = OpenAiAgent(model="text-davinci-003", api_key="<your_api_key>")
|
||||
```
|
||||
|
||||
Um BigCode oder OpenAssistant zu verwenden, melden Sie sich zunächst an, um Zugriff auf die Inference API zu erhalten:
|
||||
|
||||
```py
|
||||
from huggingface_hub import login
|
||||
|
||||
login("<YOUR_TOKEN>")
|
||||
```
|
||||
|
||||
Dann instanziieren Sie den Agenten
|
||||
|
||||
```py
|
||||
from transformers import HfAgent
|
||||
|
||||
# Starcoder
|
||||
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
|
||||
# StarcoderBase
|
||||
# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase")
|
||||
# OpenAssistant
|
||||
# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5")
|
||||
```
|
||||
|
||||
Dies geschieht mit der Inferenz-API, die Hugging Face derzeit kostenlos zur Verfügung stellt. Wenn Sie Ihren eigenen Inferenz
|
||||
Endpunkt für dieses Modell (oder einen anderen) haben, können Sie die obige URL durch Ihren URL-Endpunkt ersetzen.
|
||||
|
||||
<Tip>
|
||||
|
||||
StarCoder und OpenAssistant sind kostenlos und leisten bei einfachen Aufgaben bewundernswert gute Arbeit. Allerdings halten die Checkpoints
komplexeren Aufforderungen nicht stand. Wenn Sie mit einem solchen Problem konfrontiert sind, empfehlen wir Ihnen, das OpenAI-Modell
auszuprobieren, das zwar leider nicht quelloffen ist, aber zur Zeit eine bessere Leistung erbringt.
|
||||
|
||||
</Tip>
|
||||
|
||||
Sie sind jetzt startklar! Lassen Sie uns in die beiden APIs eintauchen, die Ihnen jetzt zur Verfügung stehen.
|
||||
|
||||
### Einzelne Ausführung (run)
|
||||
|
||||
Die Methode der einmaligen Ausführung ist die Verwendung der [`~Agent.run`] Methode des Agenten:
|
||||
|
||||
```py
|
||||
agent.run("Draw me a picture of rivers and lakes.")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
|
||||
|
||||
Es wählt automatisch das (oder die) Werkzeug(e) aus, das (die) für die von Ihnen gewünschte Aufgabe geeignet ist (sind), und führt es (sie) entsprechend aus. Es
kann eine oder mehrere Aufgaben in derselben Anweisung ausführen (je komplexer Ihre Anweisung ist, desto wahrscheinlicher ist es,
dass der Agent scheitert).
|
||||
|
||||
```py
|
||||
agent.run("Draw me a picture of the sea then transform the picture to add an island")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sea_and_island.png" width=200>
|
||||
|
||||
<br/>
|
||||
|
||||
|
||||
Jede [`~Agent.run`] Operation ist unabhängig, so dass Sie sie mehrmals hintereinander mit unterschiedlichen Aufgaben ausführen können.
|
||||
|
||||
Beachten Sie, dass Ihr `Agent` nur ein großsprachiges Modell ist, so dass kleine Variationen in Ihrer Eingabeaufforderung völlig unterschiedliche Ergebnisse liefern können.
|
||||
unterschiedliche Ergebnisse liefern. Es ist wichtig, dass Sie die Aufgabe, die Sie ausführen möchten, so genau wie möglich erklären. Wir gehen noch weiter ins Detail
|
||||
wie man gute Prompts schreibt [hier](custom_tools#writing-good-user-inputs).
|
||||
|
||||
Wenn Sie einen Status über Ausführungszeiten hinweg beibehalten oder dem Agenten Nicht-Text-Objekte übergeben möchten, können Sie dies tun, indem Sie
|
||||
Variablen, die der Agent verwenden soll. Sie könnten zum Beispiel das erste Bild von Flüssen und Seen erzeugen,
|
||||
und das Modell bitten, dieses Bild zu aktualisieren und eine Insel hinzuzufügen, indem Sie Folgendes tun:
|
||||
|
||||
```python
|
||||
picture = agent.run("Generate a picture of rivers and lakes.")
|
||||
updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture)
|
||||
```
|
||||
|
||||
<Tip>

This can be helpful when the model is unable to understand your request and mixes up the tools. An example would be:

```py
agent.run("Draw me the picture of a capybara swimming in the sea")
```

Here, the model could interpret this in two ways:
- Have the `text-to-image` tool generate a capybara swimming in the sea.
- Or, have the `text-to-image` tool generate a capybara, then use the `image-transformation` tool to make it swim in the sea.

In case you'd like to force the first scenario, you can do so by passing the prompt to it as an argument:

```py
agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea")
```

</Tip>

### Chat-based execution (chat)

The agent also has a chat-based approach, using the [`~Agent.chat`] method:

```py
agent.chat("Generate a picture of rivers and lakes")
```

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>

```py
agent.chat("Transform the picture so that there is a rock in there")
```

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_and_beaver.png" width=200>

<br/>

This is an interesting approach when you want to keep state across instructions. It's better suited for
experimentation, but it tends to work better for single instructions than for complex ones (which the [`~Agent.run`]
method handles better).

This method can also take arguments if you'd like to pass non-text types or specific prompts, as in the sketch below.
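
As a minimal sketch, mirroring the [`~Agent.run`] example above (the exact return value depends on the tool the agent
chooses to use):

```py
# Keep state across chat turns by passing non-text objects as keyword arguments.
picture = agent.chat("Generate a picture of rivers and lakes")
agent.chat("Transform the image in `picture` so that there is a rock in there", picture=picture)
```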

### ⚠️ Remote execution

For demonstration purposes and so that it can be used with all setups, we have created remote executors for several
of the default tools the agent has access to in this release. These are created using
[inference endpoints](https://huggingface.co/inference-endpoints).

We have turned these off for now, but to see how to set up remote executor tools yourself,
we recommend reading the [custom tool guide](./custom_tools).

### What's happening here? What are tools, and what are agents?

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/diagram.png">

#### Agents

The "agent" here is a large language model, and we're prompting it so that it has access to a specific set of tools.

LLMs are pretty good at generating small samples of code, and this API takes advantage of that by prompting the
LLM with a small sample of code performing a task with a set of tools. This prompt is then completed by the
task you give your agent and the description of the tools you give it. This way it gets access to the documentation of
the tools, especially their expected inputs and outputs, and can generate the relevant code.

#### Tools

Tools are very simple: they're a single function, with a name and a description. We then use these tools' descriptions
to prompt the agent. Through the prompt, we show the agent how to leverage the tools to perform what was requested
in the query.

This is using brand-new tools and not pipelines, because the agent writes better code with very atomic tools.
Pipelines are more refactored and often combine several tasks into one. Tools are meant to be focused on
one very simple task only.
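
As an illustration, here is a minimal sketch of what such a tool could look like, assuming the `Tool` interface
described in the [custom tool guide](./custom_tools) (a name, a description, declared inputs/outputs, and a `__call__`
method); treat the attribute names as indicative rather than authoritative:

```py
from transformers import Tool


class TextReverserTool(Tool):
    # The name and description are what the agent sees in its prompt.
    name = "text-reverser"
    description = "This tool takes a piece of text as input and returns the same text reversed."
    inputs = ["text"]
    outputs = ["text"]

    def __call__(self, text: str) -> str:
        return text[::-1]
```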

#### Code execution?!

This code is then executed with our small Python interpreter on the set of inputs passed along with your tools.
We hear you screaming "Arbitrary code execution!" in the back, but let us explain why that is not the case.

The only functions that can be called are the tools you provided and the print function, so you're already
limited in what can be executed. You should be safe if it's limited to Hugging Face tools.

Then, we don't allow any attribute lookups or imports (which shouldn't be needed anyway for passing the
inputs/outputs to a small set of functions), so all the most obvious attacks (and you'd need to prompt the LLM
to output them) shouldn't be an issue. If you want to be on the super safe side, you can execute the
run() method with the additional argument return_code=True, in which case the agent will just return the code
to execute and you can decide whether to run it or not.

The execution will stop at any line trying to perform an illegal operation, or if there is a regular Python error
with the code generated by the agent.

### A curated set of tools

We identify a set of tools that can empower such agents. Here is an updated list of the tools we have integrated
in `transformers`:

- **Document question answering**: given a document (such as a PDF) in image format, answer a question about this document ([Donut](./model_doc/donut))
- **Text question answering**: given a long text and a question, answer the question in the text ([Flan-T5](./model_doc/flan-t5))
- **Unconditional image captioning**: caption the image! ([BLIP](./model_doc/blip))
- **Image question answering**: given an image, answer a question about this image ([VILT](./model_doc/vilt))
- **Image segmentation**: given an image and a prompt, output the segmentation mask of that prompt ([CLIPSeg](./model_doc/clipseg))
- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper))
- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5))
- **Zero-shot text classification**: given a text and a list of labels, identify to which label the text corresponds the most ([BART](./model_doc/bart))
- **Text summarization**: summarize a long text in one or a few sentences ([BART](./model_doc/bart))
- **Translation**: translate the text into a given language ([NLLB](./model_doc/nllb))

These tools have an integration in `transformers`, and can be used manually as well, for example:

```py
from transformers import load_tool

tool = load_tool("text-to-speech")
audio = tool("This is a text to speech tool")
```

### Custom tools

While we identify a curated set of tools, we strongly believe that the main value of this implementation is the
ability to quickly create and share custom tools.

By pushing the code of a tool to a Hugging Face Space or a model repository, you're then able to leverage the tool
directly with the agent. We've added a few **transformers-agnostic** tools to the
[`huggingface-tools` organization](https://huggingface.co/huggingface-tools):

- **Text downloader**: to download a text from a web URL
- **Text to image**: generate an image according to a prompt, leveraging Stable Diffusion
- **Image transformation**: modify an image given an initial image and a prompt, leveraging instruct pix2pix Stable Diffusion
- **Text to video**: generate a small video according to a prompt, leveraging damo-vilab

The text-to-image tool we have been using since the beginning is a remote tool that lives in
[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! We will
keep releasing such tools on this and other organizations, to further supercharge this implementation.

The agents have access by default to the tools that live on [*huggingface-tools*](https://huggingface.co/huggingface-tools).
We explain how to write and share your own tools, as well as how to leverage any custom tool that lives on the Hub, in [the following guide](custom_tools).
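
For example, here is a minimal sketch of handing a Hub-hosted tool to the agent, assuming the `additional_tools`
parameter described in that guide (the tool repository named here is just an illustration):

```py
from transformers import HfAgent, load_tool

# Load a custom tool hosted on the Hub and make it available to the agent.
video_tool = load_tool("huggingface-tools/text-to-video")
agent = HfAgent(
    "https://api-inference.huggingface.co/models/bigcode/starcoder",
    additional_tools=[video_tool],
)
```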

### Code generation

So far we have shown how to use the agents to perform actions for you. However, the agent only generates code
that we then execute with a very restricted Python interpreter. In case you'd like to use the generated code in
a different setting, the agent can be prompted to return the code, along with the tool definition and accurate imports.

For example, the following instruction
```python
agent.run("Draw me a picture of rivers and lakes", return_code=True)
```

returns the following code

```python
from transformers import load_tool

image_generator = load_tool("huggingface-tools/text-to-image")

image = image_generator(prompt="rivers and lakes")
```

that you can then modify and execute yourself.

@ -21,8 +21,6 @@
|
||||
title: Adding a new model to Transformers
|
||||
- local: modular_transformers
|
||||
title: Modular Transformers
|
||||
- local: auto_docstring
|
||||
title: Document your models
|
||||
- local: task_summary
|
||||
title: What 🤗 Transformers can do
|
||||
- local: tasks_explained
|
||||
@ -31,16 +29,12 @@
|
||||
title: The Transformer model family
|
||||
- local: attention
|
||||
title: Attention mechanisms
|
||||
- local: attention_interface
|
||||
title: Customizing attention function
|
||||
title: Models
|
||||
- sections:
|
||||
- local: fast_tokenizers
|
||||
title: Tokenizers
|
||||
- local: image_processors
|
||||
title: Image processors
|
||||
- local: video_processors
|
||||
title: Video processors
|
||||
- local: backbones
|
||||
title: Backbones
|
||||
- local: feature_extractors
|
||||
@ -153,8 +147,6 @@
|
||||
title: TPU
|
||||
- local: perf_train_special
|
||||
title: Apple Silicon
|
||||
- local: perf_train_gaudi
|
||||
title: Intel Gaudi
|
||||
- local: perf_hardware
|
||||
title: Build your own machine
|
||||
title: Hardware
|
||||
@ -167,14 +159,8 @@
|
||||
sections:
|
||||
- local: quantization/overview
|
||||
title: Overview
|
||||
- local: quantization/selecting
|
||||
title: Selecting a quantization method
|
||||
- local: quantization/concept_guide
|
||||
title: Quantization concepts
|
||||
- local: quantization/aqlm
|
||||
title: AQLM
|
||||
- local: quantization/auto_round
|
||||
title: AutoRound
|
||||
- local: quantization/awq
|
||||
title: AWQ
|
||||
- local: quantization/bitnet
|
||||
@ -291,8 +277,6 @@
|
||||
title: Image-text-to-text
|
||||
- local: tasks/video_text_to_text
|
||||
title: Video-text-to-text
|
||||
- local: tasks/visual_document_retrieval
|
||||
title: Visual Document Retrieval
|
||||
title: Multimodal
|
||||
title: Task recipes
|
||||
- local: run_scripts
|
||||
@ -320,6 +304,8 @@
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- sections:
|
||||
- local: main_classes/agent
|
||||
title: Agents and Tools
|
||||
- local: model_doc/auto
|
||||
title: Auto Classes
|
||||
- local: main_classes/backbones
|
||||
@ -364,9 +350,7 @@
|
||||
title: Feature Extractor
|
||||
- local: main_classes/image_processor
|
||||
title: Image Processor
|
||||
- local: main_classes/video_processor
|
||||
title: Video Processor
|
||||
title: Main Classes
|
||||
title: Main classes
|
||||
- sections:
|
||||
- sections:
|
||||
- local: model_doc/albert
|
||||
@ -393,8 +377,6 @@
|
||||
title: BigBirdPegasus
|
||||
- local: model_doc/biogpt
|
||||
title: BioGpt
|
||||
- local: model_doc/bitnet
|
||||
title: BitNet
|
||||
- local: model_doc/blenderbot
|
||||
title: Blenderbot
|
||||
- local: model_doc/blenderbot-small
|
||||
@ -431,8 +413,6 @@
|
||||
title: DeBERTa
|
||||
- local: model_doc/deberta-v2
|
||||
title: DeBERTa-v2
|
||||
- local: model_doc/deepseek_v3
|
||||
title: DeepSeek-V3
|
||||
- local: model_doc/dialogpt
|
||||
title: DialoGPT
|
||||
- local: model_doc/diffllama
|
||||
@ -477,8 +457,6 @@
|
||||
title: Gemma2
|
||||
- local: model_doc/glm
|
||||
title: GLM
|
||||
- local: model_doc/glm4
|
||||
title: glm4
|
||||
- local: model_doc/openai-gpt
|
||||
title: GPT
|
||||
- local: model_doc/gpt_neo
|
||||
@ -501,16 +479,14 @@
|
||||
title: Granite
|
||||
- local: model_doc/granitemoe
|
||||
title: GraniteMoe
|
||||
- local: model_doc/granitemoehybrid
|
||||
title: GraniteMoeHybrid
|
||||
- local: model_doc/granitemoeshared
|
||||
title: GraniteMoeShared
|
||||
- local: model_doc/granitevision
|
||||
title: GraniteVision
|
||||
- local: model_doc/helium
|
||||
title: Helium
|
||||
- local: model_doc/herbert
|
||||
title: HerBERT
|
||||
- local: model_doc/hgnet_v2
|
||||
title: HGNet-V2
|
||||
- local: model_doc/ibert
|
||||
title: I-BERT
|
||||
- local: model_doc/jamba
|
||||
@ -555,6 +531,8 @@
|
||||
title: MegatronGPT2
|
||||
- local: model_doc/mistral
|
||||
title: Mistral
|
||||
- local: model_doc/mistral3
|
||||
title: Mistral3
|
||||
- local: model_doc/mixtral
|
||||
title: Mixtral
|
||||
- local: model_doc/mluke
|
||||
@ -605,6 +583,8 @@
|
||||
title: Phi
|
||||
- local: model_doc/phi3
|
||||
title: Phi-3
|
||||
- local: model_doc/phi4_multimodal
|
||||
title: Phi4 Multimodal
|
||||
- local: model_doc/phimoe
|
||||
title: PhiMoE
|
||||
- local: model_doc/phobert
|
||||
@ -619,10 +599,6 @@
|
||||
title: Qwen2
|
||||
- local: model_doc/qwen2_moe
|
||||
title: Qwen2MoE
|
||||
- local: model_doc/qwen3
|
||||
title: Qwen3
|
||||
- local: model_doc/qwen3_moe
|
||||
title: Qwen3MoE
|
||||
- local: model_doc/rag
|
||||
title: RAG
|
||||
- local: model_doc/realm
|
||||
@ -703,8 +679,6 @@
|
||||
title: ConvNeXTV2
|
||||
- local: model_doc/cvt
|
||||
title: CvT
|
||||
- local: model_doc/d_fine
|
||||
title: D-FINE
|
||||
- local: model_doc/dab-detr
|
||||
title: DAB-DETR
|
||||
- local: model_doc/deformable_detr
|
||||
@ -751,8 +725,6 @@
|
||||
title: Mask2Former
|
||||
- local: model_doc/maskformer
|
||||
title: MaskFormer
|
||||
- local: model_doc/mlcd
|
||||
title: MLCD
|
||||
- local: model_doc/mobilenet_v1
|
||||
title: MobileNetV1
|
||||
- local: model_doc/mobilenet_v2
|
||||
@ -831,16 +803,12 @@
|
||||
title: Bark
|
||||
- local: model_doc/clap
|
||||
title: CLAP
|
||||
- local: model_doc/csm
|
||||
title: CSM
|
||||
- local: model_doc/dac
|
||||
title: dac
|
||||
- local: model_doc/encodec
|
||||
title: EnCodec
|
||||
- local: model_doc/fastspeech2_conformer
|
||||
title: FastSpeech2Conformer
|
||||
- local: model_doc/granite_speech
|
||||
title: GraniteSpeech
|
||||
- local: model_doc/hubert
|
||||
title: Hubert
|
||||
- local: model_doc/mctct
|
||||
@ -951,8 +919,6 @@
|
||||
title: GIT
|
||||
- local: model_doc/got_ocr2
|
||||
title: GOT-OCR2
|
||||
- local: model_doc/granitevision
|
||||
title: GraniteVision
|
||||
- local: model_doc/grounding-dino
|
||||
title: Grounding DINO
|
||||
- local: model_doc/groupvit
|
||||
@ -967,10 +933,6 @@
|
||||
title: InstructBLIP
|
||||
- local: model_doc/instructblipvideo
|
||||
title: InstructBlipVideo
|
||||
- local: model_doc/internvl
|
||||
title: InternVL
|
||||
- local: model_doc/janus
|
||||
title: Janus
|
||||
- local: model_doc/kosmos-2
|
||||
title: KOSMOS-2
|
||||
- local: model_doc/layoutlm
|
||||
@ -983,8 +945,6 @@
|
||||
title: LayoutXLM
|
||||
- local: model_doc/lilt
|
||||
title: LiLT
|
||||
- local: model_doc/llama4
|
||||
title: Llama4
|
||||
- local: model_doc/llava
|
||||
title: Llava
|
||||
- local: model_doc/llava_next
|
||||
@ -999,8 +959,6 @@
|
||||
title: MatCha
|
||||
- local: model_doc/mgp-str
|
||||
title: MGP-STR
|
||||
- local: model_doc/mistral3
|
||||
title: Mistral3
|
||||
- local: model_doc/mllama
|
||||
title: mllama
|
||||
- local: model_doc/nougat
|
||||
@ -1017,14 +975,10 @@
|
||||
title: PaliGemma
|
||||
- local: model_doc/perceiver
|
||||
title: Perceiver
|
||||
- local: model_doc/phi4_multimodal
|
||||
title: Phi4 Multimodal
|
||||
- local: model_doc/pix2struct
|
||||
title: Pix2Struct
|
||||
- local: model_doc/pixtral
|
||||
title: Pixtral
|
||||
- local: model_doc/qwen2_5_omni
|
||||
title: Qwen2.5-Omni
|
||||
- local: model_doc/qwen2_5_vl
|
||||
title: Qwen2.5-VL
|
||||
- local: model_doc/qwen2_audio
|
||||
@ -1033,8 +987,6 @@
|
||||
title: Qwen2VL
|
||||
- local: model_doc/sam
|
||||
title: Segment Anything
|
||||
- local: model_doc/sam_hq
|
||||
title: Segment Anything High Quality
|
||||
- local: model_doc/shieldgemma2
|
||||
title: ShieldGemma2
|
||||
- local: model_doc/siglip
|
||||
@ -1087,8 +1039,6 @@
|
||||
title: PatchTST
|
||||
- local: model_doc/time_series_transformer
|
||||
title: Time Series Transformer
|
||||
- local: model_doc/timesfm
|
||||
title: TimesFM
|
||||
title: Time series models
|
||||
- sections:
|
||||
- local: model_doc/graphormer
|
||||
@ -1114,8 +1064,6 @@
|
||||
title: Utilities for Audio processing
|
||||
- local: internal/file_utils
|
||||
title: General Utilities
|
||||
- local: internal/import_utils
|
||||
title: Importing Utilities
|
||||
- local: internal/time_series_utils
|
||||
title: Utilities for Time Series
|
||||
title: Internal helpers
|
||||
|
@ -161,7 +161,7 @@ The downside is that if you aren't used to them, it may take some time to get us
|
||||
Run the command below to start and complete the questionnaire with some basic information about the new model. This command jumpstarts the process by automatically generating some model code that you'll need to adapt.
|
||||
|
||||
```bash
|
||||
transformers add-new-model-like
|
||||
transformers-cli add-new-model-like
|
||||
```
|
||||
|
||||
## Create a pull request
|
||||
@ -292,7 +292,7 @@ Once you're able to run the original checkpoint, you're ready to start adapting
|
||||
|
||||
## Adapt the model code
|
||||
|
||||
The `transformers add-new-model-like` command should have generated a model and configuration file.
|
||||
The `transformers-cli add-new-model-like` command should have generated a model and configuration file.
|
||||
|
||||
- `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py`
|
||||
- `src/transformers/models/brand_new_llama/configuration_brand_new_llama.py`
|
||||
@ -551,10 +551,10 @@ While this example doesn't include an image processor, you may need to implement
|
||||
|
||||
If you do need to implement a new image processor, refer to an existing image processor to understand the expected structure. Slow image processors ([`BaseImageProcessor`]) and fast image processors ([`BaseImageProcessorFast`]) are designed differently, so make sure you follow the correct structure based on the processor type you're implementing.
|
||||
|
||||
Run the following command (only if you haven't already created the fast image processor with the `transformers add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.
|
||||
Run the following command (only if you haven't already created the fast image processor with the `transformers-cli add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.
|
||||
|
||||
```bash
|
||||
transformers add-fast-image-processor --model-name your_model_name
|
||||
transformers-cli add-fast-image-processor --model-name your_model_name
|
||||
```
|
||||
|
||||
This command will generate the necessary imports and provide a pre-filled template for the fast image processor. You can then modify it to fit your model's needs.
|
||||
|
@ -15,4 +15,283 @@ rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
|
||||
> Agents and tools are being spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. These docs will be deprecated in the future!
|
||||
|
||||
# Agents
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
An agent is a system where a large language model (LLM) can execute more complex tasks through *planning* and using *tools*.
|
||||
|
||||
- Planning helps a LLM reason its way through a task by breaking it down into smaller subtasks. For example, [`CodeAgent`] plans a series of actions to take and then generates Python code to execute all the actions at once.
|
||||
|
||||
Another planning method is by self-reflection and refinement of its previous actions to improve its performance. The [`ReactJsonAgent`] is an example of this type of planning, and it's based on the [ReAct](https://hf.co/papers/2210.03629) framework. This agent plans and executes actions one at a time based on the feedback it receives from each action.
|
||||
|
||||
- Tools give a LLM access to external functions or APIs that it can use to help it complete a task. For example, [gradio-tools](https://github.com/freddyaboulton/gradio-tools) gives a LLM access to any of the [Gradio](https://www.gradio.app/) apps available on Hugging Face [Spaces](https://hf.co/spaces). These apps can be used for a wide range of tasks such as image generation, video generation, audio transcription, and more.
|
||||
|
||||
To use agents in Transformers, make sure you have the extra `agents` dependencies installed.
|
||||
|
||||
```bash
|
||||
!pip install transformers[agents]
|
||||
```
|
||||
|
||||
Create an agent instance (refer to the [Agents](./main_classes/agent#agents) API for supported agents in Transformers) and a list of tools available for it to use, then [`~ReactAgent.run`] the agent on your task. The example below demonstrates how a ReAct agent reasons through a task.
|
||||
|
||||
```py
|
||||
from transformers import ReactCodeAgent
|
||||
|
||||
agent = ReactCodeAgent(tools=[])
|
||||
agent.run(
|
||||
"How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
|
||||
)
|
||||
```
|
||||
|
||||
```bash
|
||||
======== New task ========
|
||||
How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
|
||||
==== Agent is executing the code below:
|
||||
bert_layers = 12 # BERT base encoder has 12 layers
|
||||
attention_layers = 6 # Encoder in Attention is All You Need has 6 layers
|
||||
layer_diff = bert_layers - attention_layers
|
||||
print("The difference in layers between BERT base encoder and Attention is All You Need is", layer_diff)
|
||||
====
|
||||
Print outputs:
|
||||
The difference in layers between BERT base encoder and Attention is All You Need is 6
|
||||
|
||||
==== Agent is executing the code below:
|
||||
final_answer("BERT base encoder has {} more layers than the encoder from Attention is All You Need.".format(layer_diff))
|
||||
====
|
||||
Print outputs:
|
||||
|
||||
>>> Final answer:
|
||||
BERT base encoder has 6 more layers than the encoder from Attention is All You Need.
|
||||
```
|
||||
|
||||
This guide will walk you through in more detail how to initialize an agent.
|
||||
|
||||
## LLM
|
||||
|
||||
An agent uses a LLM to plan and execute a task; it is the engine that powers the agent. To choose and build your own LLM engine, you need a method that:
|
||||
|
||||
1. accepts its input in the [chat template](./chat_templating) format, `List[Dict[str, str]]`, and returns a string
2. stops generating outputs when it encounters the sequences passed in `stop_sequences`
|
||||
|
||||
```py
from huggingface_hub import InferenceClient

# `client` can be any client that exposes a chat-completion API; an InferenceClient
# pointed at a hosted model (the model name here is only an example) is one option.
client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")


def llm_engine(messages, stop_sequences=["Task"]) -> str:
    response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
    answer = response.choices[0].message.content
    return answer
```
|
||||
|
||||
Next, initialize an engine to load a model. To run an agent locally, create a [`TransformersEngine`] to load a preinitialized [`Pipeline`].
|
||||
|
||||
However, you could also leverage Hugging Face's powerful inference infrastructure, [Inference API](https://hf.co/docs/api-inference/index) or [Inference Endpoints](https://hf.co/docs/inference-endpoints/index), to run your model. This is useful for loading larger models that are typically required for agentic behavior. In this case, load the [`HfApiEngine`] to run the agent.
|
||||
|
||||
The agent requires a list of tools it can use to complete a task. If you aren't using any additional tools, pass an empty list. The default tools provided by Transformers are loaded automatically, but you can optionally set `add_base_tools=True` to explicitly enable them.
|
||||
|
||||
<hfoptions id="engine">
|
||||
<hfoption id="TransformersEngine">
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TransformersEngine, CodeAgent
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct").to("cuda")
|
||||
pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
||||
llm_engine = TransformersEngine(pipeline)
|
||||
agent = CodeAgent(tools=[], llm_engine=llm_engine)
|
||||
agent.run(
|
||||
"What causes bread to rise?",
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="HfApiEngine">
|
||||
|
||||
```py
|
||||
from transformers import CodeAgent, HfApiEngine
|
||||
|
||||
llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
|
||||
agent = CodeAgent(tools=[], llm_engine=llm_engine)
|
||||
agent.run(
|
||||
"Could you translate this sentence from French, say it out loud and return the audio.",
|
||||
sentence="Où est la boulangerie la plus proche?",
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
The agent supports [constrained generation](https://hf.co/docs/text-generation-inference/conceptual/guidance) for generating outputs according to a specific structure with the `grammar` parameter. The `grammar` parameter should be specified in the `llm_engine` method or you can set it when initializing an agent.
|
||||
|
||||
Lastly, an agent accepts additional inputs such as text and audio. In the [`HfApiEngine`] example above, the agent accepted a sentence to translate. But you could also pass a path to a local or remote file for the agent to access. The example below demonstrates how to pass a path to an audio file.
|
||||
|
||||
```py
|
||||
from transformers import ReactCodeAgent
|
||||
|
||||
agent = ReactCodeAgent(tools=[], llm_engine=llm_engine)
|
||||
agent.run("Why doesn't he know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
|
||||
```
|
||||
|
||||
## System prompt
|
||||
|
||||
A system prompt describes how an agent should behave, the tools available to it, and the expected output format.
|
||||
|
||||
Tools are defined by the `<<tool_descriptions>>` token which is dynamically replaced during runtime with the actual tool. The tool description is derived from the tool name, description, inputs, output type, and a Jinja2 template. Refer to the [Tools](./tools) guide for more information about how to describe tools.
|
||||
|
||||
The example below is the system prompt for [`ReactCodeAgent`].
|
||||
|
||||
```py
|
||||
You will be given a task to solve as best you can.
|
||||
You have access to the following tools:
|
||||
<<tool_descriptions>>
|
||||
|
||||
To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
|
||||
|
||||
At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
|
||||
Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence.
|
||||
During each intermediate step, you can use 'print()' to save whatever important information you will then need.
|
||||
These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
|
||||
|
||||
In the end you have to return a final answer using the `final_answer` tool.
|
||||
|
||||
Here are a few examples using notional tools:
|
||||
---
|
||||
{examples}
|
||||
|
||||
Above example were using notional tools that might not exist for you. You only have access to those tools:
|
||||
<<tool_names>>
|
||||
You also can perform computations in the python code you generate.
|
||||
|
||||
Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward.
|
||||
|
||||
Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
|
||||
Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
|
||||
|
||||
Remember to make sure that variables you use are all defined.
|
||||
|
||||
Now Begin!
|
||||
```
|
||||
|
||||
The system prompt can be tailored to the intended task. For example, you can add a better explanation of the output format or you can overwrite the system prompt template entirely with your own custom system prompt as shown below.
|
||||
|
||||
> [!WARNING]
|
||||
> If you're writing a custom system prompt, make sure to include `<<tool_descriptions>>` in the template so the agent is aware of the available tools.
|
||||
|
||||
```py
|
||||
from transformers import ReactJsonAgent
|
||||
from transformers.agents import PythonInterpreterTool
|
||||
|
||||
agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
|
||||
```
|
||||
|
||||
## Code execution
|
||||
|
||||
For safety, only the tools you provide (and the default Transformers tools) and the `print` function are executed. The interpreter doesn't allow importing modules that aren't on a safe list.
|
||||
|
||||
To import modules that aren't on the list, add them as a list to the `additional_authorized_imports` parameter when initializing an agent.
|
||||
|
||||
```py
|
||||
from transformers import ReactCodeAgent
|
||||
|
||||
agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
|
||||
agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
|
||||
```
|
||||
|
||||
Code execution stops if a tool isn't on the safe list, it isn't authorized, or if the code generated by the agent returns a Python error.
|
||||
|
||||
> [!WARNING]
|
||||
> A LLM can generate any arbitrary code that can be executed, so don't add any unsafe imports!
|
||||
|
||||
## Multi-agent
|
||||
|
||||
[Multi-agent](https://hf.co/papers/2308.08155) refers to multiple agents working together to solve a task. Performance is typically better because each agent is specialized for a particular subtask.
|
||||
|
||||
Multi-agents are created through the [`ManagedAgent`] class, where a *manager agent* oversees how other agents work together. The manager agent requires an agent along with its name and description. These are added to the manager agent's system prompt, which lets it know how to call and use them.
|
||||
|
||||
The multi-agent example below creates a web search agent that is managed by another [`ReactCodeAgent`].
|
||||
|
||||
```py
|
||||
from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent
|
||||
|
||||
llm_engine = HfApiEngine()
|
||||
web_agent = ReactCodeAgent(tools=[DuckDuckGoSearchTool()], llm_engine=llm_engine)
|
||||
managed_web_agent = ManagedAgent(
|
||||
agent=web_agent,
|
||||
name="web_search",
|
||||
description="Runs web searches for you. Give it your query as an argument."
|
||||
)
|
||||
manager_agent = ReactCodeAgent(
|
||||
tools=[], llm_engine=llm_engine, managed_agents=[managed_web_agent]
|
||||
)
|
||||
manager_agent.run("Who is the CEO of Hugging Face?")
|
||||
```
|
||||
|
||||
## Gradio integration
|
||||
|
||||
[Gradio](https://www.gradio.app/) is a library for quickly creating and sharing machine learning apps. The [gradio.Chatbot](https://www.gradio.app/docs/gradio/chatbot) supports chatting with a Transformers agent with the [`stream_to_gradio`] function.
|
||||
|
||||
Load a tool and LLM with an agent, and then create a Gradio app. The key is to use [`stream_to_gradio`] to stream the agent's messages and display how it reasons through a task.
|
||||
|
||||
```py
|
||||
import gradio as gr
|
||||
from transformers import (
|
||||
load_tool,
|
||||
ReactCodeAgent,
|
||||
HfApiEngine,
|
||||
stream_to_gradio,
|
||||
)
|
||||
|
||||
# Import tool from Hub
|
||||
image_generation_tool = load_tool("m-ric/text-to-image")
|
||||
llm_engine = HfApiEngine("meta-llama/Meta-Llama-3-70B-Instruct")
|
||||
|
||||
# Initialize the agent with the image generation tool
|
||||
agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
|
||||
|
||||
def interact_with_agent(task):
|
||||
messages = []
|
||||
messages.append(gr.ChatMessage(role="user", content=task))
|
||||
yield messages
|
||||
for msg in stream_to_gradio(agent, task):
|
||||
messages.append(msg)
|
||||
yield messages + [
|
||||
gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
|
||||
]
|
||||
yield messages
|
||||
|
||||
with gr.Blocks() as demo:
|
||||
text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
|
||||
submit = gr.Button("Run illustrator agent!")
|
||||
chatbot = gr.Chatbot(
|
||||
label="Agent",
|
||||
type="messages",
|
||||
avatar_images=(
|
||||
None,
|
||||
"https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
|
||||
),
|
||||
)
|
||||
submit.click(interact_with_agent, [text_input], [chatbot])
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo.launch()
|
||||
```
|
||||
|
||||
## Troubleshoot
|
||||
|
||||
For a better idea of what is happening when you call an agent, it is always a good idea to check the system prompt template first.
|
||||
|
||||
```py
|
||||
print(agent.system_prompt_template)
|
||||
```
|
||||
|
||||
If the agent is behaving unexpectedly, remember to explain the task you want to perform as clearly as possible. Every [`~Agent.run`] is different and minor variations in your system prompt may yield completely different results.
|
||||
|
||||
To find out what happened after a run, check the following agent attributes.
|
||||
|
||||
- `agent.logs` stores the fine-grained agent logs. At every step of the agent's run, everything is stored in a dictionary and appended to `agent.logs` (see the sketch below).
- `agent.write_inner_memory_from_logs` only stores a high-level overview of the agent's run. For example, at each step, it stores the LLM output as a message and the tool call output as a separate message. Not every detail from a step is transcribed by `write_inner_memory_from_logs`.
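
For example, a minimal sketch of walking through `agent.logs` after a run (the task itself is only illustrative):

```py
agent.run("Draw me a picture of rivers and lakes.")

# Each entry in agent.logs is a dictionary describing one step of the run.
for i, step in enumerate(agent.logs):
    print(f"--- step {i} ---")
    for key, value in step.items():
        print(key, ":", str(value)[:200])  # truncate long values for readability
```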
|
||||
|
||||
## Resources
|
||||
|
||||
Learn more about ReAct agents in the [Open-source LLMs as LangChain Agents](https://hf.co/blog/open-source-llms-as-agents) blog post.
|
||||
|
@ -1,128 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Attention Interface
|
||||
|
||||
This page describes how to use the `AttentionInterface` in order to register custom attention functions to use with
|
||||
supported models.
|
||||
|
||||
## Customizing attention function
|
||||
|
||||
Most recent models can now switch from one attention function used in the Attention layer to the other, thanks to a simple mapping.
|
||||
By default, we provide the implementation for [`sdpa`](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html),
|
||||
[`flash_attention_2`](https://github.com/Dao-AILab/flash-attention) and [`flex_attention`](https://pytorch.org/docs/stable/nn.attention.flex_attention.html#module-torch.nn.attention.flex_attention)
|
||||
as well as `eager`, which is a simple matrix multiplication without any optimization on top.
|
||||
This is the setting you can usually choose when instantiating a model:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model_id = "meta-llama/Llama-3.2-1B"
|
||||
|
||||
# Here, using flash attention as an example
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="flash_attention_2")
|
||||
```
|
||||
|
||||
But what if you wanted to create your own attention function? Or simply play around with existing ones, adding
|
||||
a few statements here and there? You can now do so with the `AttentionInterface`! Here is an example:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AttentionInterface
|
||||
from transformers.integrations.sdpa_attention import sdpa_attention_forward
|
||||
import torch
|
||||
|
||||
model_id = "meta-llama/Llama-3.2-1B"
|
||||
|
||||
def my_new_sdpa(*args, **kwargs):
|
||||
print("I just entered the attention computation")
|
||||
return sdpa_attention_forward(*args, **kwargs)
|
||||
|
||||
AttentionInterface.register("my_new_sdpa", my_new_sdpa)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="my_new_sdpa")
|
||||
# Try running the forward with the new attention function
|
||||
model(torch.ones(1, 5, dtype=int))
|
||||
```
|
||||
|
||||
You will see it prints "I just entered the attention computation" as many times as there are layers in the model (with this example, 16 times).
|
||||
|
||||
## Dynamically switching attention function
|
||||
|
||||
You could dynamically change the model's attention function as well, by overriding the `config._attn_implementation` field:
|
||||
|
||||
```python
|
||||
# Back to use original sdpa implementation
|
||||
model.config._attn_implementation = "sdpa"
|
||||
|
||||
model(torch.ones(1, 5, dtype=int))
|
||||
```
|
||||
|
||||
and it will stop printing the statements, as it now uses the `sdpa` attention.
This allows you to quickly change the attention function without needing to reload the model!
|
||||
|
||||
## What about new args needed in my custom attention function?
|
||||
|
||||
But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
`AttentionInterface` propagate kwargs all the way to the Attention layers, and to the attention function used. That way,
you can simply pass the arg (as a kwarg, i.e. you need to qualify the name of the arg) in the model's forward, and it will be correctly used in the attention. However, custom attention functions have some limitations. In particular, they must follow the signature and return format of the other attention functions, i.e.
|
||||
|
||||
```python
|
||||
from typing import Optional, Tuple

from transformers import AutoModelForCausalLM, AttentionInterface
from transformers.integrations.sdpa_attention import sdpa_attention_forward
import torch

def custom_attention(
    module: torch.nn.Module,  # required arg
    query: torch.Tensor,  # required arg
    key: torch.Tensor,  # required arg
    value: torch.Tensor,  # required arg
    attention_mask: Optional[torch.Tensor],  # required arg
    a_new_kwargs=None,  # You can now add as many kwargs as you need
    another_new_kwargs=None,  # You can now add as many kwargs as you need
    **kwargs,  # You need to accept **kwargs as models will pass other args
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
|
||||
... # do your magic!
|
||||
return attn_output, attn_weights # attn_weights are optional here
|
||||
|
||||
AttentionInterface.register("custom", custom_attention)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="custom")
|
||||
# Forward pass with the new kwargs
|
||||
model(torch.ones(1, 5, dtype=int), a_new_kwargs=..., another_new_kwargs=...)
|
||||
```
|
||||
|
||||
If in doubt about what args/kwargs a given model sends to the attention function, simply check that model's modeling code on [GitHub](https://github.com/huggingface/transformers/tree/main/src/transformers/models)!
|
||||
|
||||
## Accessing current available implementations
|
||||
|
||||
Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
|
||||
and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
|
||||
would expect from a usual Python dictionary:
|
||||
|
||||
```python
|
||||
>>> from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
|
||||
>>> list(ALL_ATTENTION_FUNCTIONS.keys())
|
||||
>>> ['flash_attention_2', 'flex_attention', 'sdpa']
|
||||
|
||||
>>> ALL_ATTENTION_FUNCTIONS["sdpa"]
|
||||
>>> <function transformers.integrations.sdpa_attention.sdpa_attention_forward>
|
||||
|
||||
>>> ALL_ATTENTION_FUNCTIONS.get("sdpa", None)
|
||||
>>> <function transformers.integrations.sdpa_attention.sdpa_attention_forward>
|
||||
|
||||
# You can also globally `register` a new function directly on it
|
||||
>>> ALL_ATTENTION_FUNCTIONS.register("new_func", new_func)
|
||||
```
|
@ -1,279 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Utilizing the @auto_docstring Decorator
|
||||
|
||||
The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
|
||||
|
||||
---
|
||||
|
||||
## 📜 How it Works
|
||||
|
||||
The `@auto_docstring` decorator constructs docstrings by:
|
||||
|
||||
1. **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
|
||||
2. **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
|
||||
3. **Overriding or Adding Arguments Descriptions:**
|
||||
* **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
|
||||
* **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
|
||||
4. **Adding Classes and Functions Introduction:**
|
||||
* **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
|
||||
* **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
|
||||
5. **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
|
||||
6. **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
|
||||
7. **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extract field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
|
||||
8. **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 🚀 How to Use @auto_docstring
|
||||
|
||||
### 1. Importing the Decorator
|
||||
Import the decorator into your modeling file:
|
||||
|
||||
```python
|
||||
from ...utils import auto_docstring
|
||||
```
|
||||
|
||||
### 2. Applying to Classes
|
||||
Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
|
||||
|
||||
```python
|
||||
from transformers.modeling_utils import PreTrainedModel
|
||||
from ...utils import auto_docstring
|
||||
|
||||
@auto_docstring
|
||||
class MyAwesomeModel(PreTrainedModel):
|
||||
def __init__(self, config, custom_parameter: int = 10, another_custom_arg: str = "default"):
|
||||
r"""
|
||||
custom_parameter (`int`, *optional*, defaults to 10):
|
||||
Description of the custom_parameter for MyAwesomeModel.
|
||||
another_custom_arg (`str`, *optional*, defaults to "default"):
|
||||
Documentation for another unique argument.
|
||||
"""
|
||||
super().__init__(config)
|
||||
self.custom_parameter = custom_parameter
|
||||
self.another_custom_arg = another_custom_arg
|
||||
# ... rest of your init
|
||||
|
||||
# ... other methods
|
||||
```
|
||||
|
||||
#### Advanced Class Decoration:
|
||||
|
||||
Arguments can be passed directly to `@auto_docstring` for more control:
|
||||
|
||||
```python
|
||||
@auto_docstring(
|
||||
custom_intro="""This model performs specific synergistic operations.
|
||||
It builds upon the standard Transformer architecture with unique modifications.""",
|
||||
custom_args="""
|
||||
custom_parameter (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
|
||||
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
|
||||
"""
|
||||
)
|
||||
class MySpecialModel(PreTrainedModel):
|
||||
def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
|
||||
# ...
|
||||
```
|
||||
|
||||
Or:
|
||||
|
||||
```python
|
||||
@auto_docstring(
|
||||
custom_intro="""This model performs specific synergistic operations.
|
||||
It builds upon the standard Transformer architecture with unique modifications.""",
|
||||
)
|
||||
class MySpecialModel(PreTrainedModel):
|
||||
def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
|
||||
r"""
|
||||
custom_parameter (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
|
||||
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
|
||||
"""
|
||||
# ...
|
||||
```
|
||||
|
||||
### 3. Applying to Functions (e.g., `forward` method)
|
||||
Apply the decorator above method definitions, such as the `forward` method.
|
||||
|
||||
```python
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
new_custom_argument: Optional[torch.Tensor] = None,
|
||||
arg_documented_in_args_doc: Optional[torch.Tensor] = None,
|
||||
# ... other arguments
|
||||
) -> Union[Tuple, ModelOutput]: # The description of the return value will automatically be generated from the ModelOutput class docstring.
|
||||
r"""
|
||||
new_custom_argument (`torch.Tensor`, *optional*):
|
||||
Description of this new custom argument and its expected shape or type.
|
||||
"""
|
||||
# ...
|
||||
```
|
||||
|
||||
#### Advanced Function Decoration:
|
||||
|
||||
Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:
|
||||
|
||||
```python
|
||||
MODEL_COMMON_CUSTOM_ARGS = r"""
|
||||
common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
|
||||
Description of common_arg_1
|
||||
common_arg_2 (`torch.Tensor`, *optional*, defaults to `default_value`):
|
||||
Description of common_arg_2
|
||||
...
|
||||
"""
|
||||
|
||||
class MyModel(PreTrainedModel):
|
||||
# ...
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
This is a custom introduction for the function.
|
||||
"""
|
||||
custom_args=MODEL_COMMON_CUSTOM_ARGS
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
common_arg_1: Optional[torch.Tensor] = None,
|
||||
common_arg_2: Optional[torch.Tensor] = None,
|
||||
#...
|
||||
function_specific_argument: Optional[torch.Tensor] = None,
|
||||
# ... other arguments
|
||||
) -> torch.Tensor:
|
||||
r"""
|
||||
function_specific_argument (`torch.Tensor`, *optional*):
|
||||
Description of an argument specific to this function
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: For a function returning a generic type, a custom "Returns" section can be specified.
|
||||
|
||||
Example:
|
||||
|
||||
(To override the default example with a custom one or to add an example for a model class that does not have a pipeline)
|
||||
|
||||
```python
|
||||
...
|
||||
```
|
||||
"""
|
||||
# ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### ✍️ Documenting Arguments: Approach & Priority
|
||||
|
||||
1. **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
|
||||
* `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
|
||||
|
||||
2. **New or Custom Arguments:**
|
||||
* **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
|
||||
* **Format:**
|
||||
```
|
||||
argument_name (`type`, *optional*, defaults to `X`):
|
||||
Description of the argument.
|
||||
Explain its purpose, expected shape/type if complex, and default behavior.
|
||||
This can span multiple lines.
|
||||
```
|
||||
* Include `type` in backticks.
|
||||
* Add "*optional*" if the argument is not required (has a default value).
|
||||
* Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
|
||||
|
||||
3. **Overriding Standard Arguments:**
|
||||
* If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
|
||||
* The `labels` argument is often customized per model and typically requires a specific docstring.
|
||||
|
||||
4. **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
|
||||
* New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
|
||||
|
||||
---
|
||||
|
||||
### Usage with [modular files](./modular_transformers)
|
||||
|
||||
When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
|
||||
|
||||
- **For standalone models in modular files:**
|
||||
Apply the `@auto_docstring` decorator just as you would in regular modeling files.
|
||||
|
||||
- **For models inheriting from other library models:**
|
||||
- When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
|
||||
- If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
|
||||
|
||||
> **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
|
||||
|
||||
|
||||
**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
|
||||
|
||||
---
|
||||
|
||||
## ✅ Checking Your Docstrings with `check_auto_docstrings`
|
||||
|
||||
The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
|
||||
|
||||
#### What it Checks:
|
||||
|
||||
* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
|
||||
* **Argument Completeness & Consistency:**
|
||||
* Flags arguments in the signature that are not known standard arguments and lack a local description.
|
||||
* Ensures documented arguments exist in the signature. (TODO)
|
||||
* Verifies that types and default values in the docstring match the signature. (TODO)
|
||||
* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
|
||||
* **Formatting:** Adherence to the expected docstring style.
|
||||
|
||||
#### Running the Check Locally:
|
||||
|
||||
Run this check locally before committing. The common command is:
|
||||
|
||||
```bash
|
||||
make fix-copies
|
||||
```
|
||||
|
||||
Alternatively, to only perform the docstring and auto-docstring checks, you can use:
|
||||
|
||||
```bash
|
||||
python utils/check_docstrings.py # to only check files included in the diff without fixing them
|
||||
# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
|
||||
# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
|
||||
```
|
||||
|
||||
#### Workflow with the Checker:
|
||||
|
||||
1. Add `@auto_docstring(...)` to the class or method.
|
||||
2. For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
|
||||
3. Run `make fix-copies` (or the `check_docstrings.py` utility).
|
||||
* For unrecognized arguments lacking documentation, the utility will create placeholder entries (an example is shown after this list).
|
||||
4. Manually edit these placeholders with accurate types and descriptions.
|
||||
5. Re-run the check to ensure all issues are resolved.
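For reference, a placeholder entry generated for an undocumented argument typically looks like the snippet below (the argument name is illustrative); replace `<fill_type>` and `<fill_docstring>` with the real type and description:

```
new_custom_argument (`<fill_type>`):
    <fill_docstring>
```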
|
||||
|
||||
---
|
||||
|
||||
## 🔑 Key Takeaways & Best Practices
|
||||
|
||||
* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary methods (e.g., `forward`, `get_text_features`, etc.).
|
||||
* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
|
||||
* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
|
||||
* Document new or custom arguments clearly.
|
||||
* Run `check_docstrings` locally and iteratively.
|
||||
|
||||
By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
|
@ -181,6 +181,35 @@ processed_chat = processor.apply_chat_template(
|
||||
print(processed_chat.keys())
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="custom frame sampling">
|
||||
|
||||
Some models don't sample frames *uniformly* and require more complex logic to determine which frames to use. For example, the model may use *adaptive frame selection* or prioritize *key moments* in a video rather than evenly spaced frames.
|
||||
|
||||
If a model has a different sampling strategy, you can write a function that customizes frame selection. The function should meet the following requirements.
|
||||
|
||||
- Use the `sample_indices_fn` parameter to pass a callable function for sampling.
|
||||
- If provided, this function *overrides* the standard `num_frames` and `fps` parameters.
|
||||
- The function receives all the parameters passed to `load_video` and must return valid frame indices to sample from.
|
||||
|
||||
An example function is shown below. This gives you full control over frame selection, making the model more adaptable to different video scenarios.
|
||||
|
||||
```py
|
||||
def sample_indices_fn(metadata, **kwargs):
|
||||
# samples only the first and the second frame
|
||||
return [0, 1]
|
||||
|
||||
processed_chat = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
sample_indices_fn=sample_indices_fn,
|
||||
video_load_backend="decord",
|
||||
)
|
||||
print(processed_chat.keys())
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="list of image frames">
|
||||
|
||||
|
@ -25,28 +25,22 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_
|
||||
|
||||
This guide shows you how to quickly start chatting with Transformers from the command line, how to build and format a conversation, and how to chat using the [`TextGenerationPipeline`].
|
||||
|
||||
## transformers CLI
|
||||
## transformers-cli
|
||||
|
||||
After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
|
||||
Chat with a model directly from the command line as shown below. It launches an interactive session with a model. Enter `clear` to reset the conversation, `exit` to terminate the session, and `help` to display all the command options.
|
||||
|
||||
```bash
|
||||
transformers chat Qwen/Qwen2.5-0.5B-Instruct
|
||||
transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers-chat-cli.png"/>
|
||||
</div>
|
||||
|
||||
You can launch the CLI with arbitrary `generate` flags, with the format `arg_1=value_1 arg_2=value_2 ...`
|
||||
|
||||
```bash
|
||||
transformers chat Qwen/Qwen2.5-0.5B-Instruct do_sample=False max_new_tokens=10
|
||||
```
|
||||
|
||||
For a full list of options, run the command below.
|
||||
|
||||
```bash
|
||||
transformers chat -h
|
||||
transformers-cli chat -h
|
||||
```
|
||||
|
||||
The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).
|
||||
@ -82,16 +76,16 @@ print(response[0]["generated_text"][-1]["content"])
|
||||
(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,
|
||||
alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!
|
||||
|
||||
So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
|
||||
things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
|
||||
Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
|
||||
something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
|
||||
So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
|
||||
things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
|
||||
Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
|
||||
something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
|
||||
some wild stuff, like that Warhol guy's soup cans and all that jazz.
|
||||
|
||||
And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
|
||||
And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
|
||||
those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.
|
||||
|
||||
Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
|
||||
Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
|
||||
even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)
|
||||
|
||||
And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
|
||||
@ -113,9 +107,9 @@ print(response[0]["generated_text"][-1]["content"])
|
||||
```
|
||||
|
||||
```txt
|
||||
(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
|
||||
It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
|
||||
like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
|
||||
(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
|
||||
It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
|
||||
like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
|
||||
(sarcastically) Oh, yeah, real original, Andy.
|
||||
|
||||
But, you know, back in the '60s, it was like, a big deal. People were all about challenging the
|
||||
|
@ -20,22 +20,18 @@ A decoding strategy informs how a model should select the next generated token.
|
||||
|
||||
This guide will help you understand the different decoding strategies available in Transformers and how and when to use them.
|
||||
|
||||
## Basic decoding methods
|
||||
## Greedy search
|
||||
|
||||
These are well established decoding methods, and should be your starting point for text generation tasks.
|
||||
Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in [`GenerationConfig`], this strategy generates a maximum of 20 tokens.
|
||||
|
||||
### Greedy search
|
||||
|
||||
Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in [`GenerationConfig`], this strategy generates a maximum of 20 new tokens.
|
||||
|
||||
Greedy search works well for tasks with relatively short outputs where creativity is not a priority. However, it breaks down when generating longer sequences because it begins to repeat itself.
|
||||
Greedy search works well for tasks with relatively short outputs. However, it breaks down when generating longer sequences because it begins to repeat itself.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
inputs = tokenizer("I look forward to", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to default length because Llama2 generation length is 4096
|
||||
@ -44,11 +40,11 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company that provides a suite of tools and services for building, deploying, and maintaining natural language processing'
|
||||
```
|
||||
|
||||
### Sampling
|
||||
## Contrastive search
|
||||
|
||||
Sampling, or multinomial sampling, randomly selects a token based on the probability distribution over the entire model's vocabulary (as opposed to the most likely token, as in greedy search). This means every token with a non-zero probability has a chance to be selected. Sampling strategies reduce repetition and can generate more creative and diverse outputs.
|
||||
[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied.
|
||||
|
||||
Enable multinomial sampling with `do_sample=True` and `num_beams=1`.
|
||||
Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return.
|
||||
|
||||
```py
|
||||
import torch
|
||||
@ -59,14 +55,14 @@ inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt"
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 100 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
|
||||
outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company 🤗\nWe are open-source and believe that open-source is the best way to build technology. Our mission is to make AI accessible to everyone, and we believe that open-source is the best way to achieve that.'
|
||||
'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has'
|
||||
```
|
||||
|
||||
### Beam search
|
||||
## Beam search
|
||||
|
||||
Beam search keeps track of several generated sequences (beams) at each time step. After a certain number of steps, it selects the sequence with the highest *overall* probability. Unlike greedy search, this strategy can "look ahead" and pick a sequence with a higher probability overall even if the initial tokens have a lower probability. It is best suited for input-grounded tasks, like describing an image or speech recognition. You can also use `do_sample=True` with beam search to sample at each step, but beam search will still greedily prune out low probability sequences between steps.
|
||||
Beam search keeps track of several generated sequences (beams) at each time step. After a certain number of steps, it selects the sequence with the highest *overall* probability. Unlike greedy search, this strategy can "look ahead" and pick a sequence with a higher probability overall even if the initial tokens have a lower probability.
|
||||
|
||||
> [!TIP]
|
||||
> Check out the [beam search visualizer](https://huggingface.co/spaces/m-ric/beam_search_visualizer) to see how beam search works.
|
||||
@ -87,11 +83,66 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
"['Hugging Face is an open-source company that develops and maintains the Hugging Face platform, which is a collection of tools and libraries for building and deploying natural language processing (NLP) models. Hugging Face was founded in 2018 by Thomas Wolf']"
|
||||
```
|
||||
|
||||
## Advanced decoding methods
|
||||
## Diverse beam search
|
||||
|
||||
Advanced decoding methods aim either to tackle specific generation quality issues (e.g. repetition) or to improve generation throughput in certain situations. These techniques are more complex, and may not work correctly with all models.
|
||||
[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups.
|
||||
|
||||
### Speculative decoding
|
||||
Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`).
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 50 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a'
|
||||
```
|
||||
|
||||
## Multinomial sampling
|
||||
|
||||
Search methods select the most likely tokens. Sampling, or multinomial sampling, randomly selects a token based on the probability distribution over the entire model's vocabulary. This means every token with a non-zero probability has a chance to be selected. Sampling strategies reduce repetition and can generate more creative and diverse outputs.
|
||||
|
||||
Enable multinomial sampling with `do_sample=True` and `num_beams=1`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 50 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company 🤗\nWe are open-source and believe that open-source is the best way to build technology. Our mission is to make AI accessible to everyone, and we believe that open-source is the best way to achieve that.'
|
||||
```
|
||||
|
||||
## Beam search multinomial sampling
|
||||
|
||||
This decoding strategy is a combination of beam search and multinomial sampling. It generates multiple beams and uses a sampling strategy for each beam.
|
||||
|
||||
Enable beam search multinomial sampling by setting `num_beams` to a value greater than 1 and `do_sample=True`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 50 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=4)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
'Hugging Face is an open-source company 100% dedicated to making AI more accessible. We believe that AI should be available to everyone, and we’re working hard to make that a reality.\nWe’re a team of passionate engineers, designers,'
|
||||
```
|
||||
|
||||
## Speculative decoding
|
||||
|
||||
[Speculative](https://hf.co/papers/2211.17192) or assistive decoding isn't a search or sampling strategy. Instead, speculative decoding adds a second smaller model to generate candidate tokens. The main model verifies the candidate tokens in a single `forward` pass, which speeds up the decoding process overall. This method is especially useful for LLMs where it can be more costly and slower to generate tokens. Refer to the [speculative decoding](./llm_optims#speculative-decoding) guide to learn more.
|
||||
|
||||
@ -152,7 +203,7 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
#### Prompt lookup decoding
|
||||
### Prompt lookup decoding
|
||||
|
||||
[Prompt lookup decoding](./llm_optims#prompt-lookup-decoding) is a variant of speculative decoding that uses overlapping n-grams as the candidate tokens. It works well for input-grounded tasks such as summarization. Refer to the [prompt lookup decoding](./llm_optims#prompt-lookup-decoding) guide to learn more.
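A minimal sketch is shown below, reusing the Llama 2 setup from the examples above; the only assumption is the `prompt_lookup_num_tokens` parameter, which sets the n-gram length used for candidate lookup.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
# candidate tokens are taken from overlapping n-grams in the prompt itself
outputs = model.generate(**inputs, max_new_tokens=20, prompt_lookup_num_tokens=3)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
```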
|
||||
|
||||
@ -194,7 +245,7 @@ outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
```
|
||||
|
||||
#### Universal assisted decoding
|
||||
### Universal assisted decoding
|
||||
|
||||
Universal assisted decoding (UAD) enables the main and assistant models to use different tokenizers. The main model's input tokens are re-encoded into assistant model tokens. Candidate tokens are generated in the assistant encoding and then re-encoded into main model candidate tokens. The candidate tokens are verified as explained in [speculative decoding](#speculative-decoding).
|
||||
|
||||
@ -218,27 +269,7 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
|
||||
```
|
||||
|
||||
### Contrastive search
|
||||
|
||||
[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied.
|
||||
|
||||
Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 100 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has'
|
||||
```
|
||||
|
||||
### DoLa
|
||||
## DoLa
|
||||
|
||||
[Decoding by Contrasting Layers (DoLa)](https://hf.co/papers/2309.03883) is a contrastive decoding strategy for improving factuality and reducing hallucination. This strategy works by contrasting the logit differences between the final and early layers. As a result, factual knowledge localized to particular layers is amplified. DoLa is not recommended for smaller models like GPT-2.
|
||||
|
||||
@ -294,210 +325,6 @@ tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tok
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Diverse beam search
|
||||
|
||||
[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups.
|
||||
|
||||
Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`).
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 50 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a'
|
||||
```
|
||||
|
||||
|
||||
## Custom decoding methods
|
||||
|
||||
Custom decoding methods enable specialized generation behavior such as the following:
|
||||
- have the model continue thinking if it is uncertain;
|
||||
- roll back generation if the model gets stuck;
|
||||
- handle special tokens with custom logic;
|
||||
- enhance input preparation for advanced models.
|
||||
|
||||
We enable custom decoding methods through model repositories, assuming a specific model tag and file structure (see subsection below). This feature is an extension of [custom modeling code](./models.md#custom-models) and, as such, requires setting `trust_remote_code=True`.
|
||||
|
||||
If a model repository holds a custom decoding method, the easiest way to try it out is to load the model and generate with it:
|
||||
|
||||
<!-- TODO before merging: 1) better repo name (use a `generate-community` org?) 2) prettify the repo -->
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# `transformers-community/custom_generate_example` holds a copy of `Qwen/Qwen2.5-0.5B-Instruct`, but
|
||||
# with custom generation code -> calling `generate` uses the custom decoding method!
|
||||
tokenizer = AutoTokenizer.from_pretrained("transformers-community/custom_generate_example")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"transformers-community/custom_generate_example", device_map="auto", trust_remote_code=True
|
||||
)
|
||||
|
||||
inputs = tokenizer(["The quick brown"], return_tensors="pt").to(model.device)
|
||||
# The custom decoding method is a minimal greedy decoding implementation. It also prints a custom message at run time.
|
||||
gen_out = model.generate(**inputs)
|
||||
# you should now see its custom message, "✨ using a custom generation method ✨"
|
||||
print(tokenizer.batch_decode(gen_out, skip_special_tokens=True))
|
||||
'The quick brown fox jumps over a lazy dog, and the dog is a type of animal. Is'
|
||||
```
|
||||
|
||||
Model repositories with custom decoding methods have a special property: their decoding method can be loaded from **any** model through [`~GenerationMixin.generate`]'s `custom_generate` argument. This means anyone can create and share their custom generation method to potentially work with any Transformers model, without requiring users to install additional Python packages.
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
|
||||
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
|
||||
|
||||
inputs = tokenizer(["The quick brown"], return_tensors="pt").to(model.device)
|
||||
# `custom_generate` replaces the original `generate` by the custom decoding method defined in
|
||||
# `transformers-community/custom_generate_example`
|
||||
gen_out = model.generate(**inputs, custom_generate="transformers-community/custom_generate_example", trust_remote_code=True)
|
||||
print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
|
||||
'The quick brown fox jumps over a lazy dog, and the dog is a type of animal. Is'
|
||||
```
|
||||
|
||||
You should read the `README.md` file of the repository containing the custom generation strategy to see what the new arguments and output type differences are, if they exist. Otherwise, you can assume it works like the base [`~GenerationMixin.generate`] method.
|
||||
|
||||
> [!TIP]
|
||||
> You can find all custom decoding methods by [searching for their custom tag](https://huggingface.co/models?other=custom_generate), `custom_generate`.
|
||||
|
||||
Consider the Hub repository [transformers-community/custom_generate_example](https://huggingface.co/transformers-community/custom_generate_example) as an example. The `README.md` states that it has an additional input argument, `left_padding`, which adds a number of padding tokens before the prompt.
|
||||
|
||||
```py
|
||||
gen_out = model.generate(
|
||||
**inputs, custom_generate="transformers-community/custom_generate_example", trust_remote_code=True, left_padding=5
|
||||
)
|
||||
print(tokenizer.batch_decode(gen_out)[0])
|
||||
'<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>The quick brown fox jumps over the lazy dog.\n\nThe sentence "The quick'
|
||||
```
|
||||
|
||||
If the custom method has pinned Python requirements that your environment doesn't meet, you'll get an exception about missing requirements. For instance, [transformers-community/custom_generate_bad_requirements](https://huggingface.co/transformers-community/custom_generate_bad_requirements) has an impossible set of requirements defined in its `custom_generate/requirements.txt` file, and you'll see the error message below if you try to run it.
|
||||
|
||||
```
|
||||
ImportError: Missing requirements in your local environment for `transformers-community/custom_generate_bad_requirements`:
|
||||
foo (installed: None)
|
||||
bar==0.0.0 (installed: None)
|
||||
torch>=99.0 (installed: 2.6.0)
|
||||
```
|
||||
|
||||
Updating your Python requirements accordingly will remove this error message.
|
||||
|
||||
### Creating a custom decoding method
|
||||
|
||||
To create a new decoding method, you need to create a new [**Model**](https://huggingface.co/new) repository and push a few files into it.
|
||||
1. The model you've designed your decoding method with.
|
||||
2. `custom_generate/generate.py`, which contains all the logic for your custom decoding method.
|
||||
3. `custom_generate/requirements.txt`, used to optionally add new Python requirements and/or lock specific versions to correctly use your method.
|
||||
4. `README.md`, where you should add the `custom_generate` tag and document any new arguments or output type differences of your custom method here.
|
||||
|
||||
After you've added all required files, your repository should look like this
|
||||
|
||||
```
|
||||
your_repo/
|
||||
├── README.md # include the 'custom_generate' tag
|
||||
├── config.json
|
||||
├── ...
|
||||
└── custom_generate/
|
||||
├── generate.py
|
||||
└── requirements.txt
|
||||
```
|
||||
|
||||
#### Adding the base model
|
||||
|
||||
The starting point for your custom decoding method is a model repository just like any other. The model to add to this repository should be the model you've designed your method with, and it is meant to be part of a working self-contained model-generate pair. When the model in this repository is loaded, your custom decoding method will override `generate`. Don't worry -- your decoding method can still be loaded with any other Transformers model, as explained in the section above.
|
||||
|
||||
If you simply want to copy an existing model, you can do
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("source/model_repo")
|
||||
model = AutoModelForCausalLM.from_pretrained("source/model_repo")
|
||||
tokenizer.save_pretrained("your/decoding_method", push_to_hub=True)
|
||||
model.save_pretrained("your/decoding_method", push_to_hub=True)
|
||||
```
|
||||
|
||||
#### generate.py
|
||||
|
||||
This is the core of your decoding method. It *must* contain a method named `generate`, and this method *must* contain a `model` argument as its first argument. `model` is the model instance, which means you have access to all attributes and methods in the model, including the ones defined in [`GenerationMixin`] (like the base `generate` method).
|
||||
|
||||
> [!WARNING]
|
||||
> `generate.py` must be placed in a folder named `custom_generate`, and not at the root level of the repository. The file paths for this feature are hardcoded.
|
||||
|
||||
Under the hood, when the base [`~GenerationMixin.generate`] method is called with a `custom_generate` argument, it first checks its Python requirements (if any), then locates the custom `generate` method in `generate.py`, and finally calls the custom `generate`. All received arguments and `model` are forwarded to your custom `generate` method.
|
||||
|
||||
This means your `generate` can have a mix of original and custom arguments (as well as a different output type) as shown below.
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
def generate(model, input_ids, generation_config=None, left_padding=None, **kwargs):
|
||||
generation_config = generation_config or model.generation_config # default to the model generation config
|
||||
cur_length = input_ids.shape[1]
|
||||
max_length = generation_config.max_length or cur_length + generation_config.max_new_tokens
|
||||
|
||||
# Example of custom argument: add `left_padding` (integer) pad tokens before the prompt
|
||||
if left_padding is not None:
|
||||
if not isinstance(left_padding, int) or left_padding < 0:
|
||||
raise ValueError(f"left_padding must be an integer larger than 0, but is {left_padding}")
|
||||
|
||||
pad_token = kwargs.pop("pad_token", None) or generation_config.pad_token_id or model.config.pad_token_id
|
||||
if pad_token is None:
|
||||
raise ValueError("pad_token is not defined")
|
||||
batch_size = input_ids.shape[0]
|
||||
pad_tensor = torch.full(size=(batch_size, left_padding), fill_value=pad_token).to(input_ids.device)
|
||||
input_ids = torch.cat((pad_tensor, input_ids), dim=1)
|
||||
cur_length = input_ids.shape[1]
|
||||
|
||||
# Simple greedy decoding loop
|
||||
while cur_length < max_length:
|
||||
logits = model(input_ids).logits
|
||||
next_token_logits = logits[:, -1, :]
|
||||
next_tokens = torch.argmax(next_token_logits, dim=-1)
|
||||
input_ids = torch.cat((input_ids, next_tokens[:, None]), dim=-1)
|
||||
cur_length += 1
|
||||
|
||||
return input_ids
|
||||
```
|
||||
|
||||
Follow the recommended practices below to ensure your custom decoding method works as expected.
|
||||
- Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
|
||||
- Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
|
||||
- You can add other files in the `custom_generate` folder, and use relative imports.
|
||||
- Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment.
|
||||
|
||||
#### requirements.txt
|
||||
|
||||
You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.
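For example, a hypothetical method that needs a recent PyTorch and NumPy could ship a `custom_generate/requirements.txt` like the one below (package names and versions are illustrative):

```
torch>=2.0
numpy>=1.24
```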
|
||||
|
||||
#### README.md
|
||||
|
||||
The root level `README.md` in the model repository usually describes the model therein. However, since the focus of the repository is the custom decoding method, we highly recommend shifting its focus towards describing the custom decoding method. In addition to a description of the method, we recommend documenting any input and/or output differences from the original [`~GenerationMixin.generate`]. This way, users can focus on what's new, and rely on Transformers docs for generic implementation details.
|
||||
|
||||
For discoverability, we highly recommend adding the `custom_generate` tag to your repository. To do so, the top of your `README.md` file should look like the example below. After you push the file, you should see the tag in your repository!
|
||||
|
||||
```
|
||||
---
|
||||
library_name: transformers
|
||||
tags:
|
||||
- custom_generate
|
||||
---
|
||||
|
||||
(your markdown content here)
|
||||
```
|
||||
|
||||
Recommended practices:
|
||||
- Document input and output differences in [`~GenerationMixin.generate`].
|
||||
- Add self-contained examples to enable quick experimentation.
|
||||
- Describe soft requirements, such as the method only working well with a certain family of models.
|
||||
|
||||
|
||||
## Resources
|
||||
|
||||
Read the [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate) blog post for an explanation of how common decoding strategies work.
|
||||
|
@ -90,6 +90,11 @@ class SamVisionAttentionSplit(SamVisionAttention, nn.Module):
|
||||
|
||||
attn_weights = (query * self.scale) @ key.transpose(-2, -1)
|
||||
|
||||
if self.use_rel_pos:
|
||||
attn_weights = self.add_decomposed_rel_pos(
|
||||
attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
|
||||
)
|
||||
|
||||
attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
|
||||
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
|
||||
attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
|
||||
@ -109,14 +114,13 @@ Load the model with [`~PreTrainedModel.from_pretrained`].
|
||||
|
||||
```py
|
||||
from transformers import SamModel
|
||||
from transformers.models.sam import modeling_sam
|
||||
|
||||
# replace the attention class in the modeling_sam module
|
||||
modeling_sam.SamVisionAttention = SamVisionAttentionSplit
|
||||
|
||||
# load the pretrained SAM model
|
||||
model = SamModel.from_pretrained("facebook/sam-vit-base")
|
||||
|
||||
# replace the attention class in the vision_encoder module
|
||||
for layer in model.vision_encoder.layers:
|
||||
if hasattr(layer, "attn"):
|
||||
layer.attn = SamVisionAttentionSplit(model.config.vision_config, model.config.vision_config.window_size)
|
||||
```
|
||||
|
||||
## LoRA
|
||||
@ -134,7 +138,7 @@ config = LoraConfig(
|
||||
# apply LoRA to q and v
|
||||
target_modules=["q", "v"],
|
||||
lora_dropout=0.1,
|
||||
task_type="FEATURE_EXTRACTION"
|
||||
task_type="mask-generation"
|
||||
)
|
||||
```
|
||||
|
||||
@ -148,5 +152,5 @@ Call [print_trainable_parameters](https://huggingface.co/docs/peft/package_refer
|
||||
|
||||
```py
|
||||
model.print_trainable_parameters()
|
||||
"trainable params: 589,824 || all params: 94,274,096 || trainable%: 0.6256"
|
||||
"trainable params: 608,256 || all params: 94,343,728 || trainable%: 0.6447"
|
||||
```
|
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Image processors
|
||||
|
||||
Image processors converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.
|
||||
Image processors convert images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.
|
||||
|
||||
- [`~BaseImageProcessor.center_crop`] to resize an image
|
||||
- [`~BaseImageProcessor.normalize`] or [`~BaseImageProcessor.rescale`] pixel values
|
||||
|
@ -43,3 +43,4 @@ Transformers is designed for developers and machine learning engineers and resea
|
||||
</a>
|
||||
</div>
|
||||
|
||||
Join us on the Hugging Face [Hub](https://huggingface.co/), [Discord](https://discord.com/invite/JfAtkvEtRb), or [forum](https://discuss.huggingface.co/) to collaborate and build models, datasets, and applications together.
|
||||
|
@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Installation
|
||||
|
||||
Transformers works with [PyTorch](https://pytorch.org/get-started/locally/), [TensorFlow 2.0](https://www.tensorflow.org/install/pip), and [Flax](https://flax.readthedocs.io/en/latest/). It has been tested on Python 3.9+, PyTorch 2.1+, TensorFlow 2.6+, and Flax 0.4.1+.
|
||||
Transformers works with [PyTorch](https://pytorch.org/get-started/locally/), [TensorFlow 2.0](https://www.tensorflow.org/install/pip), and [Flax](https://flax.readthedocs.io/en/latest/). It has been tested on Python 3.9+, PyTorch 2.0+, TensorFlow 2.6+, and Flax 0.4.1+.
|
||||
|
||||
## Virtual environment
|
||||
|
||||
|
@ -1,104 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Import Utilities
|
||||
|
||||
This page goes through the transformers utilities to enable lazy and fast object import.
|
||||
While we strive for minimal dependencies, some models have specific dependency requirements that cannot be
worked around. We don't want all users of `transformers` to have to install those dependencies just to use other models,
so we mark those as soft dependencies rather than hard dependencies.
|
||||
|
||||
The transformers toolkit is not made to error out on import of a model that has a specific dependency; instead, an
object for which you are lacking a dependency will only error out when you call a method on it. As an example, if
`torchvision` isn't installed, the fast image processors will not be available.
|
||||
|
||||
This object is still importable:
|
||||
|
||||
```python
|
||||
>>> from transformers import DetrImageProcessorFast
|
||||
>>> print(DetrImageProcessorFast)
|
||||
<class 'DetrImageProcessorFast'>
|
||||
```
|
||||
|
||||
However, no method can be called on that object:
|
||||
|
||||
```python
|
||||
>>> DetrImageProcessorFast.from_pretrained()
|
||||
ImportError:
|
||||
DetrImageProcessorFast requires the Torchvision library but it was not found in your environment. Checkout the instructions on the
|
||||
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
|
||||
Please note that you may need to restart your runtime after installation.
|
||||
```
|
||||
|
||||
Let's see how to specify specific object dependencies.
|
||||
|
||||
## Specifying Object Dependencies
|
||||
|
||||
### Filename-based
|
||||
|
||||
All objects under a given filename have an automatic dependency on the tool linked to that filename.
|
||||
|
||||
**TensorFlow**: All files starting with `modeling_tf_` have an automatic TensorFlow dependency.
|
||||
|
||||
**Flax**: All files starting with `modeling_flax_` have an automatic Flax dependency
|
||||
|
||||
**PyTorch**: All files starting with `modeling_` and not valid with the above (TensorFlow and Flax) have an automatic
|
||||
PyTorch dependency
|
||||
|
||||
**Tokenizers**: All files starting with `tokenization_` and ending with `_fast` have an automatic `tokenizers` dependency
|
||||
|
||||
**Vision**: All files starting with `image_processing_` have an automatic dependency on the `vision` dependency group;
|
||||
at the time of writing, this only contains the `pillow` dependency.
|
||||
|
||||
**Vision + Torch + Torchvision**: All files starting with `image_processing_` and ending with `_fast` have an automatic
|
||||
dependency on `vision`, `torch`, and `torchvision`.
|
||||
|
||||
All of these automatic dependencies are added on top of the explicit dependencies that are detailed below.
|
||||
|
||||
### Explicit Object Dependencies
|
||||
|
||||
We add a method called `requires` that is used to explicitly specify the dependencies of a given object. As an
|
||||
example, the `Trainer` class has two hard dependencies: `torch` and `accelerate`. Here is how we specify these
|
||||
required dependencies:
|
||||
|
||||
```python
|
||||
from .utils.import_utils import requires
|
||||
|
||||
@requires(backends=("torch", "accelerate"))
|
||||
class Trainer:
|
||||
...
|
||||
```
|
||||
|
||||
Backends that can be added here are all the backends that are available in the `import_utils.py` module.
|
||||
|
||||
Additionally, specific versions can be specified for each backend. For example, this is how you would specify
a `torch>=2.6` requirement on the `Trainer` class:
|
||||
|
||||
```python
|
||||
from .utils.import_utils import requires
|
||||
|
||||
@requires(backends=("torch>=2.6", "accelerate"))
|
||||
class Trainer:
|
||||
...
|
||||
```
|
||||
|
||||
You can specify the following operators: `==`, `>`, `>=`, `<`, `<=`, `!=`.
|
||||
|
||||
## Methods
|
||||
|
||||
[[autodoc]] utils.import_utils.define_import_structure
|
||||
|
||||
[[autodoc]] utils.import_utils.requires
|
@ -28,7 +28,7 @@ Most of those are only useful if you are adding new models in the library.
|
||||
|
||||
This context manager is a power user tool intended for model adders.
|
||||
It tracks all forward calls within a model's forward pass and logs a slice of each input and output in a nested JSON file.
|
||||
Note that this context manager enforces `torch.no_grad()`.
|
||||
Note that this context manager enforces `torch.inference_mode()`.
|
||||
|
||||
### Rationale
|
||||
|
||||
@ -43,7 +43,6 @@ import torch
|
||||
from PIL import Image
|
||||
import requests
|
||||
from transformers import LlavaProcessor, LlavaForConditionalGeneration
|
||||
from transformers.model_debugging_utils import model_addition_debugger_context
|
||||
torch.random.manual_seed(673)
|
||||
|
||||
# load pretrained model and processor
|
||||
@ -61,153 +60,12 @@ prompt = "<image>Describe this image."
|
||||
inputs = processor(text=prompt, images=random_image, return_tensors="pt")
|
||||
|
||||
# call forward method (not .generate!)
|
||||
with model_addition_debugger_context(
|
||||
model,
|
||||
debug_path="optional_path_to_your_directory",
|
||||
do_prune_layers=False # This will output ALL the layers of a model.
|
||||
):
|
||||
with model_addition_debugger_context(model, "optional_path_to_your_output_file.json"):
|
||||
output = model.forward(**inputs)
|
||||
|
||||
```
|
||||
|
||||
|
||||
### Reading results
|
||||
|
||||
The debugger generates two files from the forward call, both with the same base name,
|
||||
but ending either with `_SUMMARY.json` or with `_FULL_TENSORS.json`.
|
||||
|
||||
The first one will contain a summary of each module's _input_ and _output_ tensor values and shapes.
|
||||
|
||||
```json
|
||||
{
|
||||
"module_path": "MolmoForConditionalGeneration",
|
||||
"inputs": {
|
||||
"args": [],
|
||||
"kwargs": {
|
||||
"input_ids": {
|
||||
"shape": "torch.Size([1, 589])",
|
||||
"dtype": "torch.int64"
|
||||
},
|
||||
"attention_mask": {
|
||||
"shape": "torch.Size([1, 589])",
|
||||
"dtype": "torch.int64"
|
||||
},
|
||||
"pixel_values": {
|
||||
"shape": "torch.Size([1, 5, 576, 588])",
|
||||
"dtype": "torch.float32",
|
||||
"mean": "tensor(-8.9514e-01, device='cuda:0')",
|
||||
"std": "tensor(9.2586e-01, device='cuda:0')",
|
||||
"min": "tensor(-1.7923e+00, device='cuda:0')",
|
||||
"max": "tensor(1.8899e+00, device='cuda:0')"
|
||||
}
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"module_path": "MolmoForConditionalGeneration.language_model.model.embed_tokens",
|
||||
"inputs": {
|
||||
"args": [
|
||||
{
|
||||
"shape": "torch.Size([1, 589])",
|
||||
"dtype": "torch.int64"
|
||||
}
|
||||
]
|
||||
},
|
||||
"outputs": {
|
||||
"shape": "torch.Size([1, 589, 3584])",
|
||||
"dtype": "torch.float32",
|
||||
"mean": "tensor(6.5460e-06, device='cuda:0')",
|
||||
"std": "tensor(2.3807e-02, device='cuda:0')",
|
||||
"min": "tensor(-3.3398e-01, device='cuda:0')",
|
||||
"max": "tensor(3.9453e-01, device='cuda:0')"
|
||||
}
|
||||
},
|
||||
{
|
||||
"module_path": "MolmoForConditionalGeneration.vision_tower",
|
||||
"inputs": {
|
||||
"args": [
|
||||
{
|
||||
"shape": "torch.Size([5, 1, 576, 588])",
|
||||
"dtype": "torch.float32",
|
||||
"mean": "tensor(-8.9514e-01, device='cuda:0')",
|
||||
"std": "tensor(9.2586e-01, device='cuda:0')",
|
||||
"min": "tensor(-1.7923e+00, device='cuda:0')",
|
||||
"max": "tensor(1.8899e+00, device='cuda:0')"
|
||||
}
|
||||
],
|
||||
"kwargs": {
|
||||
"output_hidden_states": "True"
|
||||
}
|
||||
},
|
||||
"children": [
|
||||
{ ... and so on
|
||||
```
|
||||
|
||||
The `_FULL_TENSORS.json` file will display a full view of all tensors, which is useful
|
||||
for comparing two files.
|
||||
```json
|
||||
"pixel_values": {
|
||||
"shape": "torch.Size([1, 5, 576, 588])",
|
||||
"dtype": "torch.float32",
|
||||
"value": [
|
||||
"tensor([[[[-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" ...,",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00]],",
|
||||
"",
|
||||
" [[-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" ...,",
|
||||
" [-1.4857e+00, -1.4820e+00, -1.2100e+00, ..., -6.0979e-01, -5.9650e-01, -3.8527e-01],",
|
||||
" [-1.6755e+00, -1.7221e+00, -1.4518e+00, ..., -7.5577e-01, -7.4658e-01, -5.5592e-01],",
|
||||
" [-7.9957e-01, -8.2162e-01, -5.7014e-01, ..., -1.3689e+00, -1.3169e+00, -1.0678e+00]],",
|
||||
"",
|
||||
" [[-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" ...,",
|
||||
" [-3.0322e-01, -5.0645e-01, -5.8436e-01, ..., -6.2439e-01, -7.9160e-01, -8.1188e-01],",
|
||||
" [-4.4921e-01, -6.5653e-01, -7.2656e-01, ..., -3.4702e-01, -5.2146e-01, -5.1326e-01],",
|
||||
" [-3.4702e-01, -5.3647e-01, -5.4170e-01, ..., -1.0915e+00, -1.1968e+00, -1.0252e+00]],",
|
||||
"",
|
||||
" [[-1.1207e+00, -1.2718e+00, -1.0678e+00, ..., 1.2013e-01, -1.3126e-01, -1.7197e-01],",
|
||||
" [-6.9738e-01, -9.1166e-01, -8.5454e-01, ..., -5.5050e-02, -2.8134e-01, -4.2793e-01],",
|
||||
" [-3.4702e-01, -5.5148e-01, -5.8436e-01, ..., 1.9312e-01, -8.6235e-02, -2.1463e-01],",
|
||||
" ...,",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00]],",
|
||||
"",
|
||||
" [[-1.0039e+00, -9.5669e-01, -6.5546e-01, ..., -1.4711e+00, -1.4219e+00, -1.1389e+00],",
|
||||
" [-1.0039e+00, -9.5669e-01, -6.5546e-01, ..., -1.7193e+00, -1.6771e+00, -1.4091e+00],",
|
||||
" [-1.6317e+00, -1.6020e+00, -1.2669e+00, ..., -1.2667e+00, -1.2268e+00, -8.9720e-01],",
|
||||
" ...,",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00],",
|
||||
" [-1.7923e+00, -1.7521e+00, -1.4802e+00, ..., -1.7923e+00, -1.7521e+00, -1.4802e+00]]]], device='cuda:0')"
|
||||
],
|
||||
"mean": "tensor(-8.9514e-01, device='cuda:0')",
|
||||
"std": "tensor(9.2586e-01, device='cuda:0')",
|
||||
"min": "tensor(-1.7923e+00, device='cuda:0')",
|
||||
"max": "tensor(1.8899e+00, device='cuda:0')"
|
||||
},
|
||||
```
|
||||
|
||||
### Comparing between implementations
|
||||
|
||||
Once the forward passes of two models have been traced by the debugger, one can compare the `json` output files. See below: we can see slight differences between these two implementations' key projection layer. Inputs are mostly identical, but not quite. Looking through the file differences makes it easier to pinpoint which layer is wrong.
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
### Limitations and scope
|
||||
|
||||
This feature only works for torch-based models, and would require more work and a case-by-case approach for, say, `jax`-based models that are usually compiled. Models relying heavily on external kernel calls may work, but the trace will probably miss some things. Regardless, any Python implementation that aims at mimicking another implementation can be traced once instead of rerun N times with breakpoints.
|
||||
|
||||
If you pass `do_prune_layers=False` to your model debugger, ALL the layers will be output to `json`. Otherwise, only the first and last layers are shown. This is useful when some layers (typically cross-attention) appear only after N layers.
|
||||
[[autodoc]] model_addition_debugger
|
||||
|
||||
[[autodoc]] model_addition_debugger_context
|
||||
|
@ -16,27 +16,32 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Custom Layers and Utilities
|
||||
|
||||
This page lists all the custom layers used by the library, as well as the utility functions and classes it provides for modeling.
|
||||
This page lists all the custom layers used by the library, as well as the utility functions it provides for modeling.
|
||||
|
||||
Most of those are only useful if you are studying the code of the models in the library.
|
||||
|
||||
## Layers
|
||||
|
||||
[[autodoc]] GradientCheckpointingLayer
|
||||
|
||||
## Attention Functions
|
||||
|
||||
[[autodoc]] AttentionInterface
|
||||
- register
|
||||
|
||||
## Rotary Position Embedding Functions
|
||||
|
||||
[[autodoc]] dynamic_rope_update
|
||||
|
||||
## Pytorch custom modules
|
||||
|
||||
[[autodoc]] pytorch_utils.Conv1D
|
||||
|
||||
[[autodoc]] modeling_utils.PoolerStartLogits
|
||||
- forward
|
||||
|
||||
[[autodoc]] modeling_utils.PoolerEndLogits
|
||||
- forward
|
||||
|
||||
[[autodoc]] modeling_utils.PoolerAnswerClass
|
||||
- forward
|
||||
|
||||
[[autodoc]] modeling_utils.SquadHeadOutput
|
||||
|
||||
[[autodoc]] modeling_utils.SQuADHead
|
||||
- forward
|
||||
|
||||
[[autodoc]] modeling_utils.SequenceSummary
|
||||
- forward
|
||||
|
||||
## PyTorch Helper Functions
|
||||
|
||||
[[autodoc]] pytorch_utils.apply_chunking_to_forward
|
||||
|
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
The key-value (KV) vectors are used to calculate attention scores. For autoregressive models, KV scores are calculated *every* time because the model predicts one token at a time. Each prediction depends on the previous tokens, which means the model performs the same computations each time.
|
||||
|
||||
A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. Refer to the [Caching](./cache_explanation) doc for a more detailed explanation about how a cache works.
|
||||
A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. Refer to the [Caching](./cache_explanation.md) doc for a more detailed explanation about how a cache works.
|
||||
|
||||
Transformers offers several [`Cache`] classes that implement different caching mechanisms. Some of these [`Cache`] classes are optimized to save memory while others are designed to maximize generation speed. Refer to the table below to compare cache types and use it to help you select the best cache for your use case.
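As a minimal sketch of passing an explicit cache to [`~GenerationMixin.generate`] (assuming [`DynamicCache`] and the small `Qwen/Qwen2.5-0.5B-Instruct` checkpoint used elsewhere in these docs):

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer("The KV cache speeds up decoding because", return_tensors="pt").to(model.device)

# generate() fills this cache during prefill and reuses it at every decoding step
past_key_values = DynamicCache()
outputs = model.generate(**inputs, max_new_tokens=32, past_key_values=past_key_values)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```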
|
||||
|
||||
|
@ -93,7 +93,7 @@ model.generation_config.max_new_tokens = 16
|
||||
|
||||
past_key_values = StaticCache(
|
||||
config=model.config,
|
||||
max_batch_size=1,
|
||||
batch_size=1,
|
||||
# If you plan to reuse the cache, make sure the cache length is large enough for all cases
|
||||
max_cache_len=prompt_length+(model.generation_config.max_new_tokens*2),
|
||||
device=model.device,
|
||||
@ -159,7 +159,7 @@ from torch.nn.attention import SDPBackend, sdpa_kernel
|
||||
batch_size, seq_length = inputs["input_ids"].shape
|
||||
with torch.no_grad():
|
||||
past_key_values = StaticCache(
|
||||
config=model.config, max_batch_size=2, max_cache_len=4096, device=torch_device, dtype=model.dtype
|
||||
config=model.config, batch_size=2, max_cache_len=4096, device=torch_device, dtype=model.dtype
|
||||
)
|
||||
cache_position = torch.arange(seq_length, device=torch_device)
|
||||
generated_ids = torch.zeros(
|
||||
|
@ -20,13 +20,9 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
Text generation is the most popular application for large language models (LLMs). An LLM is trained to generate the next word (token) given some initial text (prompt) along with its own previously generated outputs, until it reaches a predefined length or an end-of-sequence (`EOS`) token.
|
||||
|
||||
In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities. This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.
|
||||
In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities.
|
||||
|
||||
> [!TIP]
|
||||
> You can also chat with a model directly from the command line. ([reference](./conversations.md#transformers-cli))
|
||||
> ```shell
|
||||
> transformers chat Qwen/Qwen2.5-0.5B-Instruct
|
||||
> ```
|
||||
This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.
|
||||
|
||||
## Default generate
|
||||
|
||||
@ -138,20 +134,6 @@ outputs = model.generate(**inputs, generation_config=generation_config)
|
||||
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
||||
```
|
||||
|
||||
## Common Options
|
||||
|
||||
[`~GenerationMixin.generate`] is a powerful tool that can be heavily customized. This can be daunting for new users. This section contains a list of popular generation options that you can define in most text generation tools in Transformers: [`~GenerationMixin.generate`], [`GenerationConfig`], `pipelines`, the `chat` CLI, ... (see the example after the table).
|
||||
|
||||
| Option name | Type | Simplified description |
|
||||
|---|---|---|
|
||||
| `max_new_tokens` | `int` | Controls the maximum generation length. Be sure to define it, as it usually defaults to a small value. |
|
||||
| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies.md) for more information. |
|
||||
| `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
|
||||
| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
|
||||
| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
|
||||
| `eos_token_id` | `List[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
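For illustration, most of these options can be passed directly as keyword arguments to [`~GenerationMixin.generate`] (the checkpoint below is just an example):

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

inputs = tokenizer("Write a short poem about caching:", return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=64,       # cap the generation length
    do_sample=True,          # sample instead of greedy decoding
    temperature=0.8,         # higher values give more creative output
    repetition_penalty=1.1,  # discourage the model from repeating itself
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```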
|
||||
|
||||
|
||||
## Pitfalls
|
||||
|
||||
The section below covers some common issues you may encounter during text generation and how to solve them.
|
||||
@ -304,4 +286,4 @@ Take a look below for some more specific and specialized text generation librari
|
||||
- [SynCode](https://github.com/uiuc-focal-lab/syncode): a library for context-free grammar guided generation (JSON, SQL, Python).
|
||||
- [Text Generation Inference](https://github.com/huggingface/text-generation-inference): a production-ready server for LLMs.
|
||||
- [Text generation web UI](https://github.com/oobabooga/text-generation-webui): a Gradio web UI for text generation.
|
||||
- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.
|
||||
- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.
|
docs/source/en/main_classes/agent.md (new file, 167 lines)
@ -0,0 +1,167 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Agents & Tools
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Transformers Agents is an experimental API which is subject to change at any time. Results returned by the agents
|
||||
can vary as the APIs or underlying models are prone to change.
|
||||
|
||||
</Tip>
|
||||
|
||||
To learn more about agents and tools, make sure to read the [introductory guide](../transformers_agents). This page
|
||||
contains the API docs for the underlying classes.
|
||||
|
||||
## Agents
|
||||
|
||||
We provide two types of agents, based on the main [`Agent`] class (see the sketch after this list):
|
||||
- [`CodeAgent`] acts in one shot: it generates the code to solve the task, then executes it at once.
|
||||
- [`ReactAgent`] acts step by step, each step consisting of one thought, then one tool call and execution. It has two classes:
|
||||
- [`ReactJsonAgent`] writes its tool calls in JSON.
|
||||
- [`ReactCodeAgent`] writes its tool calls in Python code.
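As a hedged sketch of how these classes might be instantiated (the default engine and the `add_base_tools` flag are assumptions based on the API documented below):

```python
from transformers import CodeAgent, HfApiEngine, ReactCodeAgent

llm_engine = HfApiEngine()  # defaults to a hosted Inference API model

# one-shot agent: generates the code for the whole task, then executes it
agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
agent.run("Return the string 'hello world' in uppercase.")

# step-by-step ReAct agent that writes its tool calls as Python code
react_agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
react_agent.run("How many letters are in the word 'transformers'?")
```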
|
||||
|
||||
### Agent
|
||||
|
||||
[[autodoc]] Agent
|
||||
|
||||
### CodeAgent
|
||||
|
||||
[[autodoc]] CodeAgent
|
||||
|
||||
### React agents
|
||||
|
||||
[[autodoc]] ReactAgent
|
||||
|
||||
[[autodoc]] ReactJsonAgent
|
||||
|
||||
[[autodoc]] ReactCodeAgent
|
||||
|
||||
### ManagedAgent
|
||||
|
||||
[[autodoc]] ManagedAgent
|
||||
|
||||
## Tools
|
||||
|
||||
### load_tool
|
||||
|
||||
[[autodoc]] load_tool
|
||||
|
||||
### tool
|
||||
|
||||
[[autodoc]] tool
|
||||
|
||||
### Tool
|
||||
|
||||
[[autodoc]] Tool
|
||||
|
||||
### Toolbox
|
||||
|
||||
[[autodoc]] Toolbox
|
||||
|
||||
### PipelineTool
|
||||
|
||||
[[autodoc]] PipelineTool
|
||||
|
||||
### launch_gradio_demo
|
||||
|
||||
[[autodoc]] launch_gradio_demo
|
||||
|
||||
### stream_to_gradio
|
||||
|
||||
[[autodoc]] stream_to_gradio
|
||||
|
||||
### ToolCollection
|
||||
|
||||
[[autodoc]] ToolCollection
|
||||
|
||||
## Engines
|
||||
|
||||
You're free to create and use your own engines with the Agents framework.
|
||||
These engines must meet the following specification (see the sketch after this list):
|
||||
1. Follow the [messages format](../chat_templating.md) for their input (`List[Dict[str, str]]`) and return a string.
|
||||
2. Stop generating outputs *before* the sequences passed in the `stop_sequences` argument.
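Here is a minimal sketch of a custom engine that satisfies both points, reusing the `HuggingFaceTB/SmolLM-135M-Instruct` checkpoint from the example below (the class name and defaults are illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

class CustomTransformersEngine:
    def __init__(self, model_id="HuggingFaceTB/SmolLM-135M-Instruct"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(model_id)

    def __call__(self, messages, stop_sequences=[]):
        # 1. take chat messages (List[Dict[str, str]]) as input
        inputs = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
        outputs = self.model.generate(inputs, max_new_tokens=128)
        text = self.tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
        # 2. stop before any of the stop sequences
        for stop in stop_sequences:
            if stop in text:
                text = text.split(stop)[0]
        return text
```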
|
||||
|
||||
### TransformersEngine
|
||||
|
||||
For convenience, we have added a `TransformersEngine` that implements the points above, taking a pre-initialized `Pipeline` as input.
|
||||
|
||||
```python
|
||||
>>> from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TransformersEngine
|
||||
|
||||
>>> model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
>>> model = AutoModelForCausalLM.from_pretrained(model_name)
|
||||
|
||||
>>> pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
||||
|
||||
>>> engine = TransformersEngine(pipe)
|
||||
>>> engine([{"role": "user", "content": "Ok!"}], stop_sequences=["great"])
|
||||
|
||||
"What a "
|
||||
```
|
||||
|
||||
[[autodoc]] TransformersEngine
|
||||
|
||||
### HfApiEngine
|
||||
|
||||
The `HfApiEngine` is an engine that wraps an [HF Inference API](https://huggingface.co/docs/api-inference/index) client for the execution of the LLM.
|
||||
|
||||
```python
|
||||
>>> from transformers import HfApiEngine
|
||||
|
||||
>>> messages = [
|
||||
... {"role": "user", "content": "Hello, how are you?"},
|
||||
... {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
|
||||
... {"role": "user", "content": "No need to help, take it easy."},
|
||||
... ]
|
||||
|
||||
>>> HfApiEngine()(messages, stop_sequences=["conversation"])
|
||||
|
||||
"That's very kind of you to say! It's always nice to have a relaxed "
|
||||
```
|
||||
|
||||
[[autodoc]] HfApiEngine
|
||||
|
||||
|
||||
## Agent Types
|
||||
|
||||
Agents can handle any type of object passed between tools; tools, being completely multimodal, can accept and return text, images, audio, and video, among other types. In order to increase compatibility between tools, as well as to correctly render these returns in ipython (jupyter, colab, ipython notebooks, ...), we implement wrapper classes around these types.
|
||||
|
||||
The wrapped objects should continue behaving as initially; a text object should still behave as a string, an image
|
||||
object should still behave as a `PIL.Image`.
|
||||
|
||||
These types have three specific purposes (see the sketch after this list):
|
||||
|
||||
- Calling `to_raw` on the type should return the underlying object
|
||||
- Calling `to_string` on the type should return the object as a string: that can be the string in case of an `AgentText`
|
||||
but will be the path of the serialized version of the object in other instances
|
||||
- Displaying it in an ipython kernel should display the object correctly
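A short sketch of these guarantees with `AgentImage` (the path constructor argument is an assumption; other raw types can also be wrapped):

```python
from transformers.agents.agent_types import AgentImage

agent_image = AgentImage("path/to/image.png")  # hypothetical local file

raw = agent_image.to_raw()       # the underlying PIL.Image object
path = agent_image.to_string()   # path to the serialized image on disk
```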
|
||||
|
||||
### AgentText
|
||||
|
||||
[[autodoc]] transformers.agents.agent_types.AgentText
|
||||
|
||||
### AgentImage
|
||||
|
||||
[[autodoc]] transformers.agents.agent_types.AgentImage
|
||||
|
||||
### AgentAudio
|
||||
|
||||
[[autodoc]] transformers.agents.agent_types.AgentAudio
|
@ -77,9 +77,9 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
|
||||
|
||||
[[autodoc]] TorchAoConfig
|
||||
|
||||
## BitNetQuantConfig
|
||||
## BitNetConfig
|
||||
|
||||
[[autodoc]] BitNetQuantConfig
|
||||
[[autodoc]] BitNetConfig
|
||||
|
||||
## SpQRConfig
|
||||
|
||||
@ -92,7 +92,3 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
|
||||
## QuarkConfig
|
||||
|
||||
[[autodoc]] QuarkConfig
|
||||
|
||||
## AutoRoundConfig
|
||||
|
||||
[[autodoc]] AutoRoundConfig
|
||||
|
@ -1,55 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
|
||||
# Video Processor
|
||||
|
||||
A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch tensors.
|
||||
|
||||
The video processor extends the functionality of image processors by allowing Vision Large Language Models (VLMs) to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM.
|
||||
|
||||
When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video-related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't updated your VLM, the processor will try to load video-related configurations from a file named `preprocessing_config.json`.
|
||||
|
||||
|
||||
### Usage Example
|
||||
Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
|
||||
|
||||
```python
|
||||
from transformers import AutoVideoProcessor
|
||||
|
||||
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
|
||||
```
|
||||
|
||||
Currently, if you use the base image processor for videos, it processes video data by treating each frame as an individual image and applying transformations frame-by-frame. While functional, this approach is not highly efficient. Using `AutoVideoProcessor` allows us to take advantage of **fast video processors**, leveraging the [torchvision](https://pytorch.org/vision/stable/index.html) library. Fast processors handle the whole batch of videos at once, without iterating over each video or frame. These updates introduce GPU acceleration and significantly enhance processing speed, especially for tasks requiring high throughput.

Fast video processors are available for all models and are loaded by default when an `AutoVideoProcessor` is initialized. When using a fast video processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. For an even bigger speed improvement, you can compile the processor when using `"cuda"` as the device.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers.video_utils import load_video
|
||||
from transformers import AutoVideoProcessor
|
||||
|
||||
video = load_video("video.mp4")
|
||||
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
|
||||
processor = torch.compile(processor)
|
||||
processed_video = processor(video, return_tensors="pt")
|
||||
```
|
||||
|
||||
|
||||
## BaseVideoProcessor
|
||||
|
||||
[[autodoc]] video_processing_utils.BaseVideoProcessor
|
||||
|
@ -57,7 +57,6 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). This
|
||||
- Embedding size E is different from hidden size H because the embeddings are context independent (one embedding vector represents one token), whereas hidden states are context dependent (one hidden state represents a sequence of tokens), so it is more logical to have H >> E. Also, the embedding matrix is large since it is V x E (V being the vocab size). If E < H, it has fewer parameters.
- Layers are split in groups that share parameters (to save memory).
- Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have been swapped or not.
- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`, as shown in the sketch below.
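A minimal sketch, assuming the `albert/albert-base-v2` checkpoint (the same pattern applies to any ALBERT class):

```python
from transformers import AlbertModel

# eager attention is required for head_mask to take effect
model = AlbertModel.from_pretrained("albert/albert-base-v2", attn_implementation="eager")
```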
|
||||
|
||||
### Using Scaled Dot Product Attention (SDPA)
|
||||
|
||||
|
@ -102,10 +102,6 @@ response = processor.decode(output_ids, skip_special_tokens=True)
|
||||
|
||||
[[autodoc]] AriaTextModel
|
||||
|
||||
## AriaModel
|
||||
|
||||
[[autodoc]] AriaModel
|
||||
|
||||
## AriaTextForCausalLM
|
||||
|
||||
[[autodoc]] AriaTextForCausalLM
|
||||
|
@ -74,10 +74,6 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
|
||||
|
||||
[[autodoc]] AutoImageProcessor
|
||||
|
||||
## AutoVideoProcessor
|
||||
|
||||
[[autodoc]] AutoVideoProcessor
|
||||
|
||||
## AutoProcessor
|
||||
|
||||
[[autodoc]] AutoProcessor
|
||||
|
@ -237,10 +237,6 @@ for i, output in enumerate(batch_outputs):
|
||||
|
||||
[[autodoc]] AyaVisionConfig
|
||||
|
||||
## AyaVisionModel
|
||||
|
||||
[[autodoc]] AyaVisionModel
|
||||
|
||||
## AyaVisionForConditionalGeneration
|
||||
|
||||
[[autodoc]] AyaVisionForConditionalGeneration
|
||||
|
@ -55,7 +55,6 @@ This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The
|
||||
* mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token)
|
||||
* permute sentences
|
||||
* rotate the document to make it start at a specific token
|
||||
- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Implementation Notes
|
||||
|
||||
|
@ -151,12 +151,6 @@ If you're interested in submitting a resource to be included here, please feel f
|
||||
- preprocess
|
||||
- post_process_semantic_segmentation
|
||||
|
||||
## BeitImageProcessorFast
|
||||
|
||||
[[autodoc]] BeitImageProcessorFast
|
||||
- preprocess
|
||||
- post_process_semantic_segmentation
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
|
@ -81,10 +81,10 @@ print(f"The predicted token is: {predicted_token}")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers CLI">
|
||||
<hfoption id="transformers-cli">
|
||||
|
||||
```bash
|
||||
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model google-bert/bert-base-uncased --device 0
|
||||
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google-bert/bert-base-uncased --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@ -256,4 +256,4 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran
|
||||
|
||||
[[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
|
||||
|
||||
[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
|
||||
[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
|
@ -36,7 +36,6 @@ This model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The
|
||||
- BioGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left.
|
||||
- BioGPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next token in a sequence. Leveraging this feature allows BioGPT to generate syntactically coherent text as it can be observed in the run_generation.py example script.
|
||||
- The model can take the `past_key_values` (for PyTorch) as input, which is the previously computed key/value attention pairs. Using this (past_key_values or past) value prevents the model from re-computing pre-computed values in the context of text generation. For PyTorch, see past_key_values argument of the BioGptForCausalLM.forward() method for more information on its usage.
|
||||
- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
### Using Scaled Dot Product Attention (SDPA)
|
||||
|
||||
|
@ -58,11 +58,6 @@ If you're interested in submitting a resource to be included here, please feel f
|
||||
[[autodoc]] BitImageProcessor
|
||||
- preprocess
|
||||
|
||||
## BitImageProcessorFast
|
||||
|
||||
[[autodoc]] BitImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## BitModel
|
||||
|
||||
[[autodoc]] BitModel
|
||||
|
@ -1,121 +0,0 @@
|
||||
<!--Copyright 2025 The BitNet Team and The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# BitNet
|
||||
|
||||
## Overview
|
||||
|
||||
Trained on a corpus of 4 trillion tokens, this model demonstrates that native 1-bit LLMs can achieve performance comparable to leading open-weight, full-precision models of similar size, while offering substantial advantages in computational efficiency (memory, energy, latency).
|
||||
|
||||
➡️ **Technical Report:** [BitNet b1.58 2B4T Technical Report](https://arxiv.org/abs/2504.12285)
|
||||
|
||||
➡️ **Official Inference Code:** [microsoft/BitNet (bitnet.cpp)](https://github.com/microsoft/BitNet)
|
||||
|
||||
## Model Variants
|
||||
|
||||
Several versions of the model weights are available on Hugging Face:
|
||||
|
||||
* [**`microsoft/bitnet-b1.58-2B-4T`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T): Contains the packed 1.58-bit weights optimized for efficient inference. **Use this for deployment.**
|
||||
|
||||
* [**`microsoft/bitnet-b1.58-2B-4T-bf16`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-bf16): Contains the master weights in BF16 format. **Use this only for training or fine-tuning purposes.**
|
||||
|
||||
* [**`microsoft/bitnet-b1.58-2B-4T-gguf`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-gguf): Contains the model weights in GGUF format, compatible with the `bitnet.cpp` library for CPU inference.
|
||||
|
||||
|
||||
### Model Details
|
||||
|
||||
|
||||
* **Architecture:** Transformer-based, modified with `BitLinear` layers (BitNet framework).
|
||||
* Uses Rotary Position Embeddings (RoPE).
|
||||
* Uses squared ReLU (ReLU²) activation in FFN layers.
|
||||
* Employs [`subln`](https://proceedings.mlr.press/v202/wang23u.html) normalization.
|
||||
* No bias terms in linear or normalization layers.
|
||||
* **Quantization:** Native 1.58-bit weights and 8-bit activations (W1.58A8).
|
||||
    * Weights are quantized to ternary values {-1, 0, +1} using absmean quantization during the forward pass (see the sketch after this list).
|
||||
* Activations are quantized to 8-bit integers using absmax quantization (per-token).
|
||||
* **Crucially, the model was *trained from scratch* with this quantization scheme, not post-training quantized.**
|
||||
* **Parameters:** ~2 Billion
|
||||
* **Training Tokens:** 4 Trillion
|
||||
* **Context Length:** Maximum sequence length of **4096 tokens**.
|
||||
* *Recommendation:* For optimal performance on tasks requiring very long contexts (beyond the pre-training length or for specialized long-reasoning tasks), we recommend performing intermediate long-sequence adaptation/training before the final fine-tuning stage.
|
||||
* **Training Stages:**
|
||||
1. **Pre-training:** Large-scale training on public text/code and synthetic math data using a two-stage learning rate and weight decay schedule.
|
||||
2. **Supervised Fine-tuning (SFT):** Fine-tuned on instruction-following and conversational datasets using sum loss aggregation and specific hyperparameter tuning.
|
||||
3. **Direct Preference Optimization (DPO):** Aligned with human preferences using preference pairs.
|
||||
* **Tokenizer:** LLaMA 3 Tokenizer (vocab size: 128,256).
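As an illustrative sketch of the absmean ternary quantization described above (not the optimized kernels used by `bitnet.cpp`):

```python
import torch

def absmean_ternary_quantize(w: torch.Tensor, eps: float = 1e-5):
    """Quantize a weight tensor to {-1, 0, +1} with a per-tensor absmean scale."""
    scale = w.abs().mean().clamp(min=eps)      # absmean scale
    w_q = (w / scale).round().clamp(-1, 1)     # ternary weights
    return w_q, scale

w = torch.randn(4, 4)
w_q, scale = absmean_ternary_quantize(w)
print(w_q)          # entries in {-1., 0., 1.}
print(w_q * scale)  # dequantized approximation of w
```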
|
||||
|
||||
|
||||
## Usage tips
|
||||
|
||||
|
||||
**VERY IMPORTANT NOTE ON EFFICIENCY**
|
||||
|
||||
> Please do NOT expect performance efficiency gains (in terms of speed, latency, or energy consumption) when using this model with the standard transformers library.
|
||||
>
|
||||
> The current execution paths within transformers do not contain the specialized, highly optimized computational kernels required to leverage the advantages of the BitNet architecture. Running the model via transformers will likely result in inference speeds and energy usage comparable to, or potentially worse than, standard full-precision models within this framework on both CPU and GPU.
|
||||
>
|
||||
> While you might observe reduced memory usage due to the quantized weights, the primary computational efficiency benefits are not accessible through this standard transformers usage path.
|
||||
>
|
||||
> For achieving the efficiency benefits demonstrated in the technical paper, you MUST use the dedicated C++ implementation: [bitnet.cpp](https://github.com/microsoft/BitNet).
|
||||
|
||||
### Requirements
|
||||
|
||||
```bash
|
||||
pip install transformers
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model_id = "microsoft/bitnet-b1.58-2B-4T"
|
||||
|
||||
# Load tokenizer and model
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id,
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
# Apply the chat template
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful AI assistant."},
|
||||
{"role": "user", "content": "How are you?"},
|
||||
]
|
||||
chat_input = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
|
||||
|
||||
# Generate response
|
||||
chat_outputs = model.generate(chat_input, max_new_tokens=50)
|
||||
response = tokenizer.decode(chat_outputs[0][chat_input.shape[-1]:], skip_special_tokens=True) # Decode only the response part
|
||||
print("\nAssistant Response:", response)
|
||||
```
|
||||
|
||||
|
||||
## BitNetConfig
|
||||
|
||||
[[autodoc]] BitNetConfig
|
||||
|
||||
## BitNetModel
|
||||
|
||||
[[autodoc]] BitNetModel
|
||||
- forward
|
||||
|
||||
## BitNetForCausalLM
|
||||
|
||||
[[autodoc]] BitNetForCausalLM
|
||||
- forward
|
@ -88,11 +88,6 @@ The original code can be found [here](https://github.com/salesforce/BLIP).
|
||||
[[autodoc]] BlipTextModel
|
||||
- forward
|
||||
|
||||
## BlipTextLMHeadModel
|
||||
|
||||
[[autodoc]] BlipTextLMHeadModel
|
||||
- forward
|
||||
|
||||
## BlipVisionModel
|
||||
|
||||
[[autodoc]] BlipVisionModel
|
||||
@ -128,11 +123,6 @@ The original code can be found [here](https://github.com/salesforce/BLIP).
|
||||
[[autodoc]] TFBlipTextModel
|
||||
- call
|
||||
|
||||
## TFBlipTextLMHeadModel
|
||||
|
||||
[[autodoc]] TFBlipTextLMHeadModel
|
||||
- forward
|
||||
|
||||
## TFBlipVisionModel
|
||||
|
||||
[[autodoc]] TFBlipVisionModel
|
||||
|
@ -147,11 +147,6 @@ Tips:
|
||||
[[autodoc]] BridgeTowerImageProcessor
|
||||
- preprocess
|
||||
|
||||
## BridgeTowerImageProcessorFast
|
||||
|
||||
[[autodoc]] BridgeTowerImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## BridgeTowerProcessor
|
||||
|
||||
[[autodoc]] BridgeTowerProcessor
|
||||
|
@ -90,11 +90,6 @@ Currently, following scales of pretrained Chinese-CLIP models are available on
|
||||
[[autodoc]] ChineseCLIPImageProcessor
|
||||
- preprocess
|
||||
|
||||
## ChineseCLIPImageProcessorFast
|
||||
|
||||
[[autodoc]] ChineseCLIPImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## ChineseCLIPFeatureExtractor
|
||||
|
||||
[[autodoc]] ChineseCLIPFeatureExtractor
|
||||
|
@ -14,77 +14,221 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
|
||||
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
|
||||
">
|
||||
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# CLIP
|
||||
|
||||
[CLIP](https://huggingface.co/papers/2103.00020) is a multimodal vision and language model motivated by overcoming the fixed number of object categories when training a computer vision model. CLIP learns about images directly from raw text by jointly training on 400M (image, text) pairs. Pretraining on this scale enables zero-shot transfer to downstream tasks. CLIP uses an image encoder and text encoder to get visual features and text features. Both features are projected to a latent space with the same number of dimensions and their dot product gives a similarity score.
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
|
||||
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
|
||||
">
|
||||
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
|
||||
You can find all the original CLIP checkpoints under the [OpenAI](https://huggingface.co/openai?search_models=clip) organization.
|
||||
## Overview
|
||||
|
||||
> [!TIP]
|
||||
> Click on the CLIP models in the right sidebar for more examples of how to apply CLIP to different image and language tasks.
|
||||
The CLIP model was proposed in [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh,
|
||||
Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. CLIP
|
||||
(Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. It can be
|
||||
instructed in natural language to predict the most relevant text snippet, given an image, without directly optimizing
|
||||
for the task, similarly to the zero-shot capabilities of GPT-2 and 3.
|
||||
|
||||
The example below demonstrates how to calculate similarity scores between multiple text descriptions and an image with [`Pipeline`] or the [`AutoModel`] class.
|
||||
The abstract from the paper is the following:
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
*State-of-the-art computer vision systems are trained to predict a fixed set of predetermined object categories. This
|
||||
restricted form of supervision limits their generality and usability since additional labeled data is needed to specify
|
||||
any other visual concept. Learning directly from raw text about images is a promising alternative which leverages a
|
||||
much broader source of supervision. We demonstrate that the simple pre-training task of predicting which caption goes
|
||||
with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400
|
||||
million (image, text) pairs collected from the internet. After pre-training, natural language is used to reference
|
||||
learned visual concepts (or describe new ones) enabling zero-shot transfer of the model to downstream tasks. We study
|
||||
the performance of this approach by benchmarking on over 30 different existing computer vision datasets, spanning tasks
|
||||
such as OCR, action recognition in videos, geo-localization, and many types of fine-grained object classification. The
|
||||
model transfers non-trivially to most tasks and is often competitive with a fully supervised baseline without the need
|
||||
for any dataset specific training. For instance, we match the accuracy of the original ResNet-50 on ImageNet zero-shot
|
||||
without needing to use any of the 1.28 million training examples it was trained on. We release our code and pre-trained
|
||||
model weights at this https URL.*
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/openai/CLIP).
|
||||
|
||||
clip = pipeline(
|
||||
task="zero-shot-image-classification",
|
||||
model="openai/clip-vit-base-patch32",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device=0
|
||||
)
|
||||
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]
|
||||
clip("http://images.cocodataset.org/val2017/000000039769.jpg", candidate_labels=labels)
|
||||
## Usage tips and example
|
||||
|
||||
CLIP is a multimodal vision and language model. It can be used for image-text similarity and for zero-shot image classification. CLIP uses a ViT-like transformer to get visual features and a causal language model to get the text features. Both the text and visual features are then projected to a latent space with identical dimensions. The dot product between the projected image and text features is then used as a similarity score.
|
||||
|
||||
To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches,
|
||||
which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image. The authors
|
||||
also add absolute position embeddings, and feed the resulting sequence of vectors to a standard Transformer encoder.
|
||||
The [`CLIPImageProcessor`] can be used to resize (or rescale) and normalize images for the model.
|
||||
|
||||
The [`CLIPTokenizer`] is used to encode the text. The [`CLIPProcessor`] wraps
|
||||
[`CLIPImageProcessor`] and [`CLIPTokenizer`] into a single instance to both
|
||||
encode the text and prepare the images. The following example shows how to get the image-text similarity scores using
|
||||
[`CLIPProcessor`] and [`CLIPModel`].
|
||||
|
||||
|
||||
```python
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
|
||||
>>> from transformers import CLIPProcessor, CLIPModel
|
||||
|
||||
>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
||||
|
||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
|
||||
|
||||
>>> outputs = model(**inputs)
|
||||
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
|
||||
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
```py
|
||||
import requests
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModel
|
||||
### Combining CLIP and Flash Attention 2
|
||||
|
||||
model = AutoModel.from_pretrained("openai/clip-vit-base-patch32", torch_dtype=torch.bfloat16, attn_implementation="sdpa")
|
||||
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
||||
First, make sure to install the latest version of Flash Attention 2.
|
||||
|
||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
image = Image.open(requests.get(url, stream=True).raw)
|
||||
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]
|
||||
|
||||
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
|
||||
|
||||
outputs = model(**inputs)
|
||||
logits_per_image = outputs.logits_per_image
|
||||
probs = logits_per_image.softmax(dim=1)
|
||||
most_likely_idx = probs.argmax(dim=1).item()
|
||||
most_likely_label = labels[most_likely_idx]
|
||||
print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_likely_idx].item():.3f}")
|
||||
```bash
|
||||
pip install -U flash-attn --no-build-isolation
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
Also make sure that your hardware is compatible with Flash Attention 2; read more about it in the official documentation of the flash-attn repository. Also make sure to load your model in half-precision (e.g. `torch.float16`)
|
||||
|
||||
## Notes
|
||||
<Tip warning={true}>
|
||||
|
||||
- Use [`CLIPImageProcessor`] to resize (or rescale) and normalize images for the model.
|
||||
For small batch sizes, you might notice a slowdown in your model when using flash attention. Refer to the section [Expected speedups with Flash Attention and SDPA](#Expected-speedups-with-Flash-Attention-and-SDPA) below and select an appropriate attention implementation.
|
||||
|
||||
</Tip>
|
||||
|
||||
To load and run a model using Flash Attention 2, refer to the snippet below:
|
||||
|
||||
```python
|
||||
>>> import torch
|
||||
>>> import requests
|
||||
>>> from PIL import Image
|
||||
|
||||
>>> from transformers import CLIPProcessor, CLIPModel
|
||||
|
||||
>>> device = "cuda"
|
||||
>>> torch_dtype = torch.float16
|
||||
|
||||
>>> model = CLIPModel.from_pretrained(
|
||||
... "openai/clip-vit-base-patch32",
|
||||
... attn_implementation="flash_attention_2",
|
||||
... device_map=device,
|
||||
... torch_dtype=torch_dtype,
|
||||
... )
|
||||
>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
||||
|
||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
|
||||
>>> inputs.to(device)
|
||||
|
||||
>>> with torch.no_grad():
|
||||
... with torch.autocast(device):
|
||||
... outputs = model(**inputs)
|
||||
|
||||
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
|
||||
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
|
||||
>>> print(probs)
|
||||
tensor([[0.9946, 0.0052]], device='cuda:0', dtype=torch.float16)
|
||||
```
|
||||
|
||||
|
||||
### Using Scaled Dot Product Attention (SDPA)
|
||||
|
||||
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
|
||||
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
|
||||
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
|
||||
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
|
||||
page for more information.
|
||||
|
||||
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
|
||||
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
|
||||
|
||||
```python
|
||||
from transformers import CLIPModel
|
||||
|
||||
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", torch_dtype=torch.float16, attn_implementation="sdpa")
|
||||
```
|
||||
|
||||
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
|
||||
|
||||
### Expected speedups with Flash Attention and SDPA
|
||||
|
||||
On a local benchmark (NVIDIA A10G, PyTorch 2.3.1+cu121) with `float16`, we saw the following speedups during inference for `"openai/clip-vit-large-patch14"` checkpoint ([code](https://gist.github.com/qubvel/ac691a54e54f9fae8144275f866a7ff8)):
|
||||
|
||||
#### CLIPTextModel
|
||||
|
||||
| Num text labels | Eager (s/iter) | FA2 (s/iter) | FA2 speedup | SDPA (s/iter) | SDPA speedup |
|
||||
|------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:|
|
||||
| 4 | 0.009 | 0.012 | 0.737 | 0.007 | 1.269 |
|
||||
| 16 | 0.009 | 0.014 | 0.659 | 0.008 | 1.187 |
|
||||
| 32 | 0.018 | 0.021 | 0.862 | 0.016 | 1.142 |
|
||||
| 64 | 0.034 | 0.034 | 1.001 | 0.03 | 1.163 |
|
||||
| 128 | 0.063 | 0.058 | 1.09 | 0.054 | 1.174 |
|
||||
|
||||

|
||||
|
||||
#### CLIPVisionModel
|
||||
|
||||
| Image batch size | Eager (s/iter) | FA2 (s/iter) | FA2 speedup | SDPA (s/iter) | SDPA speedup |
|
||||
|-------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:|
|
||||
| 1 | 0.016 | 0.013 | 1.247 | 0.012 | 1.318 |
|
||||
| 4 | 0.025 | 0.021 | 1.198 | 0.021 | 1.202 |
|
||||
| 16 | 0.093 | 0.075 | 1.234 | 0.075 | 1.24 |
|
||||
| 32 | 0.181 | 0.147 | 1.237 | 0.146 | 1.241 |
|
||||
|
||||

|
||||
|
||||
#### CLIPModel
|
||||
|
||||
| Image batch size | Num text labels | Eager (s/iter) | FA2 (s/iter) | FA2 speedup | SDPA (s/iter) | SDPA speedup |
|
||||
|-------------------:|------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:|
|
||||
| 1 | 4 | 0.025 | 0.026 | 0.954 | 0.02 | 1.217 |
|
||||
| 1 | 16 | 0.026 | 0.028 | 0.918 | 0.02 | 1.287 |
|
||||
| 1 | 64 | 0.042 | 0.046 | 0.906 | 0.036 | 1.167 |
|
||||
| 4 | 4 | 0.028 | 0.033 | 0.849 | 0.024 | 1.189 |
|
||||
| 4 | 16 | 0.034 | 0.035 | 0.955 | 0.029 | 1.169 |
|
||||
| 4 | 64 | 0.059 | 0.055 | 1.072 | 0.05 | 1.179 |
|
||||
| 16 | 4 | 0.096 | 0.088 | 1.091 | 0.078 | 1.234 |
|
||||
| 16 | 16 | 0.102 | 0.09 | 1.129 | 0.083 | 1.224 |
|
||||
| 16 | 64 | 0.127 | 0.11 | 1.157 | 0.105 | 1.218 |
|
||||
| 32 | 4 | 0.185 | 0.159 | 1.157 | 0.149 | 1.238 |
|
||||
| 32 | 16 | 0.19 | 0.162 | 1.177 | 0.154 | 1.233 |
|
||||
| 32 | 64 | 0.216 | 0.181 | 1.19 | 0.176 | 1.228 |
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CLIP.
|
||||
|
||||
- [Fine tuning CLIP with Remote Sensing (Satellite) images and captions](https://huggingface.co/blog/fine-tune-clip-rsicd), a blog post about how to fine-tune CLIP with [RSICD dataset](https://github.com/201528014227051/RSICD_optimal) and comparison of performance changes due to data augmentation.
|
||||
- This [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/contrastive-image-text) shows how to train a CLIP-like vision-text dual encoder model using a pre-trained vision and text encoder using [COCO dataset](https://cocodataset.org/#home).
|
||||
|
||||
<PipelineTag pipeline="image-to-text"/>
|
||||
|
||||
- A [notebook](https://colab.research.google.com/drive/1tuoAC5F4sC7qid56Z0ap-stR3rwdk0ZV?usp=sharing) on how to use a pretrained CLIP for inference with beam search for image captioning. 🌎
|
||||
|
||||
**Image retrieval**
|
||||
|
||||
- A [notebook](https://colab.research.google.com/drive/1bLVwVKpAndpEDHqjzxVPr_9nGrSbuOQd?usp=sharing) on image retrieval using pretrained CLIP and computing MRR(Mean Reciprocal Rank) score. 🌎
|
||||
- A [notebook](https://colab.research.google.com/github/deep-diver/image_search_with_natural_language/blob/main/notebooks/Image_Search_CLIP.ipynb) on image retrieval and showing the similarity score. 🌎
|
||||
- A [notebook](https://colab.research.google.com/drive/1xO-wC_m_GNzgjIBQ4a4znvQkvDoZJvH4?usp=sharing) on how to map images and texts to the same vector space using Multilingual CLIP. 🌎
|
||||
- A [notebook](https://colab.research.google.com/github/vivien000/clip-demo/blob/master/clip.ipynb#scrollTo=uzdFhRGqiWkR) on how to run CLIP on semantic image search using [Unsplash](https://unsplash.com) and [TMDB](https://www.themoviedb.org/) datasets. 🌎
|
||||
|
||||
**Explainability**
|
||||
|
||||
- A [notebook](https://colab.research.google.com/github/hila-chefer/Transformer-MM-Explainability/blob/main/CLIP_explainability.ipynb) on how to visualize similarity between input token and image segment. 🌎
|
||||
|
||||
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it.
|
||||
The resource should ideally demonstrate something new instead of duplicating an existing resource.
|
||||
|
||||
## CLIPConfig
|
||||
|
||||
|
@ -14,154 +14,108 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
|
||||
">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# CodeLlama
|
||||
|
||||
[Code Llama](https://huggingface.co/papers/2308.12950) is a specialized family of large language models based on [Llama 2](./llama2) for coding tasks. It comes in different flavors - general code, Python-specific, and an instruction-following variant - all available in 7B, 13B, 34B, and 70B parameters. Code Llama models can generate, explain, and even fill in missing parts of your code (called "infilling"). They can also handle very long contexts with stable generation up to 100k tokens, even though they were trained on sequences of 16K tokens.
|
||||
|
||||
You can find all the original Code Llama checkpoints under the [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933) collection.
|
||||
|
||||
> [!TIP]
|
||||
> Click on the Code Llama models in the right sidebar for more examples of how to apply Code Llama to different coding tasks.
|
||||
|
||||
The example below demonstrates how to generate code with [`Pipeline`], or the [`AutoModel`], and from the command line.
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipe = pipeline(
|
||||
"text-generation",
|
||||
model="meta-llama/CodeLlama-7b-hf",
|
||||
torch_dtype=torch.float16,
|
||||
device_map=0
|
||||
)
|
||||
|
||||
# basic code generation
|
||||
result = pipe("# Function to calculate the factorial of a number\ndef factorial(n):", max_new_tokens=256)
|
||||
print(result[0]['generated_text'])
|
||||
|
||||
# infilling
|
||||
infill_result = pipe("def remove_non_ascii(s: str) -> str:\n \"\"\" <FILL_ME>\n return result", max_new_tokens=200)
|
||||
print(infill_result[0]['generated_text'])
|
||||
```

</hfoption>
<hfoption id="AutoModel">

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/CodeLlama-7b-hf",
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="sdpa"
)

# basic code generation
prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(
    **input_ids,
    max_new_tokens=256,
    cache_implementation="static"
)
print(tokenizer.decode(output[0], skip_special_tokens=True))

# infilling
infill_prompt = "def remove_non_ascii(s: str) -> str:\n    \"\"\" <FILL_ME>\n    return result"
input_ids = tokenizer(infill_prompt, return_tensors="pt").to(model.device)

filled_output = model.generate(**input_ids, max_new_tokens=200)
filled_text = tokenizer.decode(filled_output[0], skip_special_tokens=True)
print(filled_text)
```

</hfoption>
<hfoption id="transformers CLI">

```bash
echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
```

</hfoption>
</hfoptions>

Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4-bits.

```py
# pip install bitsandbytes
import torch
from transformers import AutoModelForCausalLM, CodeLlamaTokenizer, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True)
tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-34b-hf")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/CodeLlama-34b-hf",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)

prompt = "# Write a Python function to check if a string is a palindrome\ndef is_palindrome(s):"
input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(**input_ids, max_new_tokens=200, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139) to better understand what tokens the model can and cannot attend to.

```py
from transformers.utils.attention_visualizer import AttentionMaskVisualizer

visualizer = AttentionMaskVisualizer("meta-llama/CodeLlama-7b-hf")
visualizer("""def func(a, b):
  return a + b""")
```

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/codellama-attn-mask.png"/>
</div>

<div class="flex flex-wrap space-x-1">
    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>

## Overview

The Code Llama model was proposed in [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.

The abstract from the paper is the following:

*We release Code Llama, a family of large language models for code based on Llama 2 providing state-of-the-art performance among open models, infilling capabilities, support for large input contexts, and zero-shot instruction following ability for programming tasks. We provide multiple flavors to cover a wide range of applications: foundation models (Code Llama), Python specializations (Code Llama - Python), and instruction-following models (Code Llama - Instruct) with 7B, 13B and 34B parameters each. All models are trained on sequences of 16k tokens and show improvements on inputs with up to 100k tokens. 7B and 13B Code Llama and Code Llama - Instruct variants support infilling based on surrounding content. Code Llama reaches state-of-the-art performance among open models on several code benchmarks, with scores of up to 53% and 55% on HumanEval and MBPP, respectively. Notably, Code Llama - Python 7B outperforms Llama 2 70B on HumanEval and MBPP, and all our models outperform every other publicly available model on MultiPL-E. We release Code Llama under a permissive license that allows for both research and commercial use.*

Check out all Code Llama model checkpoints [here](https://huggingface.co/models?search=code_llama) and the officially released ones in the [Meta Llama org](https://huggingface.co/meta-llama).

This model was contributed by [ArthurZucker](https://huggingface.co/ArthurZ). The original code of the authors can be found [here](https://github.com/facebookresearch/llama).

## Usage tips and examples

<Tip warning={true}>

The `Llama2` family models, on which Code Llama is based, were trained using `bfloat16`, but the original inference uses `float16`. Let's look at the different precisions:

* `float32`: PyTorch convention on model initialization is to load models in `float32`, no matter which `dtype` the model weights were stored with. `transformers` also follows this convention for consistency with PyTorch. This is the default. If you want the `AutoModel` API to load the checkpoints with the storage weights type, you must specify `torch_dtype="auto"`, e.g. `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype="auto")`.
* `bfloat16`: Code Llama was trained with this precision, so we recommend using it for further training or fine-tuning.
* `float16`: We recommend running inference using this precision, as it's usually faster than `bfloat16`, and evaluation metrics show no discernible degradation with respect to `bfloat16`. You can also run inference using `bfloat16`, and we recommend you check inference results with both `float16` and `bfloat16` after fine-tuning.

As mentioned above, the `dtype` of the storage weights is mostly irrelevant unless you are using `torch_dtype="auto"` when initializing a model. The reason is that the model will first be downloaded (using the `dtype` of the checkpoints online) and then cast to the default `dtype` of `torch` (`torch.float32`). If there is a specified `torch_dtype`, it will be used instead. A short example of these options follows this tip.

</Tip>
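
For illustration, here is a minimal sketch of the three options above. It reuses the 7B checkpoint from the examples on this page; loading the same checkpoint three times is only for demonstration.

```py
import torch
from transformers import AutoModelForCausalLM

ckpt = "meta-llama/CodeLlama-7b-hf"

# default: weights are loaded and cast to torch.float32
model_fp32 = AutoModelForCausalLM.from_pretrained(ckpt)
print(next(model_fp32.parameters()).dtype)  # torch.float32

# keep whatever dtype the checkpoint was stored with
model_auto = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype="auto")
print(next(model_auto.parameters()).dtype)  # the storage dtype of the checkpoint

# explicit half precision, recommended for inference
model_fp16 = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16)
print(next(model_fp16.parameters()).dtype)  # torch.float16
```
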
Tips:

- The infilling task is supported out of the box. You should be using the `tokenizer.fill_token` where you want your input to be filled.
- The model conversion script is the same as for the `Llama2` family:

Here is a sample usage:

```bash
python src/transformers/models/llama/convert_llama_weights_to_hf.py \
    --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
```

Note that executing the script requires enough CPU RAM to host the whole model in float16 precision. Even though the biggest versions come in several checkpoints, each checkpoint contains only a part of each weight of the model, so all of them need to be loaded in RAM.

After conversion, the model and tokenizer can be loaded via:

```python
>>> from transformers import LlamaForCausalLM, CodeLlamaTokenizer

>>> tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
>>> model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
>>> PROMPT = '''def remove_non_ascii(s: str) -> str:
...     """ <FILL_ME>
...     return result
... '''
>>> input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
>>> generated_ids = model.generate(input_ids, max_new_tokens=128)

>>> filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
>>> print(PROMPT.replace("<FILL_ME>", filling))
def remove_non_ascii(s: str) -> str:
    """ Remove non-ASCII characters from a string.
<BLANKLINE>
    Args:
        s: The string to remove non-ASCII characters from.
<BLANKLINE>
    Returns:
        The string with non-ASCII characters removed.
    """
    result = ""
    for c in s:
        if ord(c) < 128:
            result += c
    return result
<BLANKLINE>
```

If you only want the infilled part:

```python
>>> from transformers import pipeline
>>> import torch

>>> generator = pipeline("text-generation", model="meta-llama/CodeLlama-7b-hf", torch_dtype=torch.float16, device_map="auto")
>>> generator('def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return result', max_new_tokens=128)
[{'generated_text': 'def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return resultRemove non-ASCII characters from a string. """\n    result = ""\n    for c in s:\n        if ord(c) < 128:\n            result += c'}]
```

Under the hood, the tokenizer [automatically splits by `<FILL_ME>`](https://huggingface.co/docs/transformers/main/model_doc/code_llama#transformers.CodeLlamaTokenizer.fill_token) to create a formatted input string that follows [the original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself: it avoids pitfalls, such as token glueing, that are very hard to debug. To see how much CPU and GPU memory you need for this model or others, try [this calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) which can help determine that value.
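
For example, a quick way to inspect this behavior (the exact special tokens printed depend on the checkpoint's tokenizer configuration):

```py
from transformers import CodeLlamaTokenizer

tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
prompt = 'def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return result'

print(tokenizer.fill_token)  # "<FILL_ME>"
# the text before the fill token becomes the prefix and the text after it the suffix,
# wrapped with the tokenizer's infilling special tokens
print(tokenizer.tokenize(prompt))
```
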

The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string.

<Tip>

Code Llama has the same architecture as the `Llama2` models. Refer to [Llama2's documentation page](llama2) for the API reference, and find the Code Llama tokenizer reference below.

</Tip>

## Notes

- Infilling is only available in the 7B and 13B base models, and not in the Python, Instruct, 34B, or 70B models.
- Use the `<FILL_ME>` token where you want your input to be filled. The tokenizer splits this token to create a formatted input string that follows the [original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself.

    ```py
    from transformers import LlamaForCausalLM, CodeLlamaTokenizer

    tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
    model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
    PROMPT = '''def remove_non_ascii(s: str) -> str:
        """ <FILL_ME>
        return result
    '''
    input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
    generated_ids = model.generate(input_ids, max_new_tokens=128)

    filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens=True)[0]
    print(PROMPT.replace("<FILL_ME>", filling))
    ```

- Use `bfloat16` for further training or fine-tuning and `float16` for inference.
- The `BOS` character is not used for infilling when encoding the prefix or suffix, but only at the beginning of each prompt.
- The tokenizer is a byte-pair encoding model based on [SentencePiece](https://github.com/google/sentencepiece). During decoding, if the first token is the start of the word (for example, "Banana"), the tokenizer doesn't prepend the prefix space to the string.

## CodeLlamaTokenizer

@ -1,115 +1,124 @@
<div style="float: right;">
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
    </div>
</div>

# Cohere

Cohere Command-R is a 35B parameter multilingual large language model designed for long context tasks like retrieval-augmented generation (RAG) and calling external APIs and tools. The model is specifically trained for grounded generation and supports both single-step and multi-step tool use. It supports a context length of 128K tokens.

You can find all the original Command-R checkpoints under the [Command Models](https://huggingface.co/collections/CohereForAI/command-models-67652b401665205e17b192ad) collection.

> [!TIP]
> Click on the Cohere models in the right sidebar for more examples of how to apply Cohere to different language tasks.

The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.

<hfoptions id="usage">
<hfoption id="Pipeline">

```python
import torch
from transformers import pipeline

pipeline = pipeline(
    task="text-generation",
    model="CohereForAI/c4ai-command-r-v01",
    torch_dtype=torch.float16,
    device=0
)
pipeline("Plants create energy through a process known as")
```

</hfoption>
<hfoption id="AutoModel">

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")

# format message with the Command-R chat template
messages = [{"role": "user", "content": "How do plants make energy?"}]
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
output = model.generate(
    input_ids,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.3,
    cache_implementation="static",
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

</hfoption>
<hfoption id="transformers CLI">

```bash
# pip install -U flash-attn --no-build-isolation
transformers chat CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
```

</hfoption>
</hfoptions>

Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4-bits.

```python
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

bnb_config = BitsAndBytesConfig(load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", torch_dtype=torch.float16, device_map="auto", quantization_config=bnb_config, attn_implementation="sdpa")

# format message with the Command-R chat template
messages = [{"role": "user", "content": "How do plants make energy?"}]
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
output = model.generate(
    input_ids,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.3,
    cache_implementation="static",
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139) to better understand what tokens the model can and cannot attend to.

```py
from transformers.utils.attention_visualizer import AttentionMaskVisualizer

visualizer = AttentionMaskVisualizer("CohereForAI/c4ai-command-r-v01")
visualizer("Plants create energy through a process known as")
```

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/cohere-attn-mask.png"/>
</div>

<div class="flex flex-wrap space-x-1">
    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
    <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
    <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

## Overview

The Cohere Command-R model was proposed in the blogpost [Command-R: Retrieval Augmented Generation at Production Scale](https://txt.cohere.com/command-r/) by the Cohere Team.

The abstract from the paper is the following:

*Command-R is a scalable generative model targeting RAG and Tool Use to enable production-scale AI for enterprise. Today, we are introducing Command-R, a new LLM aimed at large-scale production workloads. Command-R targets the emerging “scalable” category of models that balance high efficiency with strong accuracy, enabling companies to move beyond proof of concept, and into production.*

*Command-R is a generative model optimized for long context tasks such as retrieval augmented generation (RAG) and using external APIs and tools. It is designed to work in concert with our industry-leading Embed and Rerank models to provide best-in-class integration for RAG applications and excel at enterprise use cases. As a model built for companies to implement at scale, Command-R boasts:*

- Strong accuracy on RAG and Tool Use
- Low latency, and high throughput
- Longer 128k context and lower pricing
- Strong capabilities across 10 key languages
- Model weights available on HuggingFace for research and evaluation*

Check out the model checkpoints [here](https://huggingface.co/CohereForAI/c4ai-command-r-v01).

This model was contributed by [Saurabh Dash](https://huggingface.co/saurabhdash) and [Ahmet Üstün](https://huggingface.co/ahmetustun). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox).

## Usage tips

<Tip warning={true}>

The checkpoints uploaded on the Hub use `torch_dtype = 'float16'`, which will be used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`.

The `dtype` of the online weights is mostly irrelevant unless you are using `torch_dtype="auto"` when initializing a model using `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype="auto")`. The reason is that the model will first be downloaded (using the `dtype` of the checkpoints online), then it will be cast to the default `dtype` of `torch` (`torch.float32`), and finally, if there is a `torch_dtype` provided in the config, it will be used.

Training the model in `float16` is not recommended and is known to produce `nan`; as such, the model should be trained in `bfloat16`.

</Tip>

The model and tokenizer can be loaded via:

```python
# pip install transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "CohereForAI/c4ai-command-r-v01"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Format message with the command-r chat template
messages = [{"role": "user", "content": "Hello, how are you?"}]
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>

gen_tokens = model.generate(
    input_ids,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.3,
)

gen_text = tokenizer.decode(gen_tokens[0])
print(gen_text)
```

- When using Flash Attention 2 via `attn_implementation="flash_attention_2"`, don't pass `torch_dtype` to the `from_pretrained` class method and use Automatic Mixed-Precision training. When using `Trainer`, simply set either `fp16` or `bf16` to `True`. Otherwise, make sure you are using `torch.autocast`. This is required because Flash Attention only supports the `fp16` and `bf16` data types.

## Resources

A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Command-R. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.

<PipelineTag pipeline="text-generation"/>

Loading a bitsandbytes 4-bit quantized model:

```python
# pip install transformers bitsandbytes accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True)

model_id = "CohereForAI/c4ai-command-r-v01"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)

# Format message with the command-r chat template
messages = [{"role": "user", "content": "Hello, how are you?"}]
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")

gen_tokens = model.generate(
    input_ids,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.3,
)

gen_text = tokenizer.decode(gen_tokens[0])
print(gen_text)
```

## Notes

- Don't use the `torch_dtype` parameter in [`~AutoModel.from_pretrained`] if you're using FlashAttention-2 because it only supports `fp16` or `bf16`. You should use [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html), set `fp16` or `bf16` to `True` if using [`Trainer`], or use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast), as shown in the sketch below.
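
A minimal sketch of that setup, assuming a CUDA GPU with the `flash-attn` package installed:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "CohereForAI/c4ai-command-r-v01"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# no torch_dtype here; FlashAttention-2 needs fp16/bf16, which autocast provides below
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",
)

messages = [{"role": "user", "content": "Hello, how are you?"}]
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)

with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
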

## CohereConfig

@ -134,3 +143,5 @@ visualizer("Plants create energy through a process known as")

[[autodoc]] CohereForCausalLM
    - forward

@ -1,4 +1,5 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

@ -8,135 +9,77 @@ Unless required by applicable law or agreed to in writing, software distributed
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

<div style="float: right;">
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
    </div>
</div>

# ColPali

[ColPali](https://huggingface.co/papers/2407.01449) is a model designed to retrieve documents by analyzing their visual features. Unlike traditional systems that rely heavily on text extraction and OCR, ColPali treats each page as an image. It uses [Paligemma-3B](./paligemma) to capture not only text, but also the layout, tables, charts, and other visual elements to create detailed embeddings. This offers a more comprehensive understanding of documents and enables more efficient and accurate retrieval.

You can find all the original ColPali checkpoints under the [ColPali](https://huggingface.co/collections/vidore/hf-native-colvision-models-6755d68fc60a8553acaa96f7) collection.

> [!TIP]
> Click on the ColPali models in the right sidebar for more examples of how to use ColPali for image retrieval.

## Overview

The *ColPali* model was proposed in [ColPali: Efficient Document Retrieval with Vision Language Models](https://doi.org/10.48550/arXiv.2407.01449) by **Manuel Faysse***, **Hugues Sibille***, **Tony Wu***, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo (* denotes equal contribution). Work led by ILLUIN Technology.

In our proposed *ColPali* approach, we leverage VLMs to construct efficient multi-vector embeddings directly from document images ("screenshots") for document retrieval. We train the model to maximize the similarity between these document embeddings and the corresponding query embeddings, using the late interaction method introduced in ColBERT.

Using *ColPali* removes the need for potentially complex and brittle layout recognition and OCR pipelines with a single model that can take into account both the textual and visual content (layout, charts, etc.) of a document.

## Resources

- The *ColPali* arXiv paper can be found [here](https://doi.org/10.48550/arXiv.2407.01449). 📄
- The official blog post detailing ColPali can be found [here](https://huggingface.co/blog/manu/colpali). 📝
- The original model implementation code for the ColPali model and for the `colpali-engine` package can be found [here](https://github.com/illuin-tech/colpali). 🌎
- Cookbooks for learning to use the transformers-native version of *ColPali*, fine-tuning, and similarity maps generation can be found [here](https://github.com/tonywu71/colpali-cookbooks). 📚

This model was contributed by [@tonywu71](https://huggingface.co/tonywu71) and [@yonigozlan](https://huggingface.co/yonigozlan).

## Usage

This example demonstrates how to use *ColPali* to embed both queries and images, calculate their similarity scores, and identify the most relevant matches. For a specific query, you can retrieve the top-k most similar images by selecting the ones with the highest similarity scores.

<hfoptions id="usage">
<hfoption id="image retrieval">

```python
import requests
import torch
from PIL import Image

from transformers import ColPaliForRetrieval, ColPaliProcessor

model_name = "vidore/colpali-v1.2-hf"

# Load model (bfloat16 support is limited; fall back to float32 if needed)
model = ColPaliForRetrieval.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",  # "cpu", "cuda", or "mps" for Apple Silicon
).eval()

processor = ColPaliProcessor.from_pretrained(model_name)

url1 = "https://upload.wikimedia.org/wikipedia/commons/8/89/US-original-Declaration-1776.jpg"
url2 = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Romeoandjuliet1597.jpg/500px-Romeoandjuliet1597.jpg"

images = [
    Image.open(requests.get(url1, stream=True).raw),
    Image.open(requests.get(url2, stream=True).raw),
]

queries = [
    "Who printed the edition of Romeo and Juliet?",
    "When was the United States Declaration of Independence proclaimed?",
]

# Process the inputs
inputs_images = processor(images=images, return_tensors="pt").to(model.device)
inputs_text = processor(text=queries, return_tensors="pt").to(model.device)

# Forward pass
with torch.no_grad():
    image_embeddings = model(**inputs_images).embeddings
    query_embeddings = model(**inputs_text).embeddings

scores = processor.score_retrieval(query_embeddings, image_embeddings)

print("Retrieval scores (query x image):")
print(scores)
```

</hfoption>
</hfoptions>

Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

The example below uses [bitsandbytes](../quantization/bitsandbytes.md) to quantize the weights to int4.

```py
import requests
import torch
from PIL import Image
from transformers import BitsAndBytesConfig, ColPaliForRetrieval, ColPaliProcessor

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_name = "vidore/colpali-v1.2-hf"

# Load model
model = ColPaliForRetrieval.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="cuda:0",  # or "mps" if on Apple Silicon
).eval()

processor = ColPaliProcessor.from_pretrained(model_name)

url1 = "https://upload.wikimedia.org/wikipedia/commons/8/89/US-original-Declaration-1776.jpg"
url2 = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Romeoandjuliet1597.jpg/500px-Romeoandjuliet1597.jpg"

# Your inputs (replace dummy images with screenshots of your documents)
images = [
    Image.open(requests.get(url1, stream=True).raw),
    Image.open(requests.get(url2, stream=True).raw),
    Image.new("RGB", (32, 32), color="white"),
    Image.new("RGB", (16, 16), color="black"),
]

queries = [
    "Who printed the edition of Romeo and Juliet?",
    "When was the United States Declaration of Independence proclaimed?",
    "What is the organizational structure for our R&D department?",
    "Can you provide a breakdown of last year's financial performance?",
]

# Process the inputs
inputs_images = processor(images=images, return_tensors="pt").to(model.device)
inputs_text = processor(text=queries, return_tensors="pt").to(model.device)

# Forward pass
with torch.no_grad():
    image_embeddings = model(**inputs_images).embeddings
    query_embeddings = model(**inputs_text).embeddings

# Score the queries against the images
scores = processor.score_retrieval(query_embeddings, image_embeddings)

print("Retrieval scores (query x image):")
print(scores)
```

## Notes

- [`~ColPaliProcessor.score_retrieval`] returns a 2D tensor where the first dimension is the number of queries and the second dimension is the number of images. A higher score indicates more similarity between the query and image; see the short example below.
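
For example, a small follow-up that reuses `scores` and `queries` from the examples above to pick the highest-scoring image for each query:

```py
import torch

# scores is expected to have shape (num_queries, num_images)
best = torch.topk(scores, k=1, dim=1)
for query, idx, score in zip(queries, best.indices.squeeze(1).tolist(), best.values.squeeze(1).tolist()):
    print(f"{query!r} -> image {idx} (score {score:.2f})")
```
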

## ColPaliConfig

[[autodoc]] ColPaliConfig

@ -48,11 +48,6 @@ This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The o

[[autodoc]] ConditionalDetrImageProcessor
    - preprocess

## ConditionalDetrImageProcessorFast

[[autodoc]] ConditionalDetrImageProcessorFast
    - preprocess
    - post_process_object_detection
    - post_process_instance_segmentation
    - post_process_semantic_segmentation

@ -1,377 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Csm

## Overview

The Conversational Speech Model (CSM) is the first open-source contextual text-to-speech model [released by Sesame](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice). It is designed to generate natural-sounding speech with or without conversational context. This context typically consists of multi-turn dialogue between speakers, represented as sequences of text and corresponding spoken audio.

**Model Architecture:**
CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi.md), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.

The original csm-1b checkpoint is available under the [Sesame](https://huggingface.co/sesame/csm-1b) organization on Hugging Face.

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/csm_architecture.png"/>
</div>

## Usage Tips

### Without Conversational Context

CSM can be used to simply generate speech from a text prompt:

```python
import torch
from transformers import CsmForConditionalGeneration, AutoProcessor

model_id = "eustlb/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

# prepare the inputs
text = "[0]The past is just a story we tell ourselves."  # `[0]` for speaker id 0
inputs = processor(text, add_special_tokens=True).to(device)

# another equivalent way to prepare the inputs
conversation = [
    {"role": "0", "content": [{"type": "text", "text": "The past is just a story we tell ourselves."}]},
]
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# infer the model
audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, "example_without_context.wav")
```

### With Conversational Context

CSM can be used to generate speech given a conversation, allowing consistency in the voices and content-aware generation:

```python
import torch
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset, Audio

model_id = "eustlb/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

# prepare the inputs
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
# ensure the audio is 24kHz
ds = ds.cast_column("audio", Audio(sampling_rate=24000))
conversation = []

# 1. context
for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
    conversation.append(
        {
            "role": f"{speaker_id}",
            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        }
    )

# 2. text prompt
conversation.append({"role": f"{ds[4]['speaker_id']}", "content": [{"type": "text", "text": ds[4]["text"]}]})

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# infer the model
audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, "example_with_context.wav")
```

### Batched Inference

CSM supports batched inference!

```python
import torch
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset, Audio

model_id = "eustlb/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

# prepare the inputs
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
# ensure the audio is 24kHz
ds = ds.cast_column("audio", Audio(sampling_rate=24000))
# here a batch with two prompts
conversation = [
    [
        {
            "role": f"{ds[0]['speaker_id']}",
            "content": [
                {"type": "text", "text": ds[0]["text"]},
                {"type": "audio", "path": ds[0]["audio"]["array"]},
            ],
        },
        {
            "role": f"{ds[1]['speaker_id']}",
            "content": [
                {"type": "text", "text": ds[1]["text"]},
            ],
        },
    ],
    [
        {
            "role": f"{ds[0]['speaker_id']}",
            "content": [
                {"type": "text", "text": ds[0]["text"]},
            ],
        }
    ],
]
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, [f"speech_batch_idx_{i}.wav" for i in range(len(audio))])
```

### Making The Model Go Brrr

CSM supports full-graph compilation with CUDA graphs!

```python
import torch
import copy
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset

model_id = "eustlb/csm-1b"
device = "cuda"

# set logs to ensure no recompilation and graph breaks
torch._logging.set_logs(graph_breaks=True, recompiles=True, cudagraphs=True)

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

# use static cache, enabling automatically torch compile with fullgraph and reduce-overhead
model.generation_config.max_length = 250  # big enough to avoid recompilation
model.generation_config.max_new_tokens = None  # would take precedence over max_length
model.generation_config.cache_implementation = "static"
model.depth_decoder.generation_config.cache_implementation = "static"

# generation kwargs
gen_kwargs = {
    "do_sample": False,
    "depth_decoder_do_sample": False,
    "temperature": 1.0,
    "depth_decoder_temperature": 1.0,
}

# Define a timing context manager
class TimerContext:
    def __init__(self, name="Execution"):
        self.name = name
        self.start_event = None
        self.end_event = None

    def __enter__(self):
        # Use CUDA events for more accurate GPU timing
        self.start_event = torch.cuda.Event(enable_timing=True)
        self.end_event = torch.cuda.Event(enable_timing=True)
        self.start_event.record()
        return self

    def __exit__(self, *args):
        self.end_event.record()
        torch.cuda.synchronize()
        elapsed_time = self.start_event.elapsed_time(self.end_event) / 1000.0
        print(f"{self.name} time: {elapsed_time:.4f} seconds")

# prepare the inputs
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")

conversation = [
    {
        "role": f"{ds[0]['speaker_id']}",
        "content": [
            {"type": "text", "text": ds[0]["text"]},
            {"type": "audio", "path": ds[0]["audio"]["array"]},
        ],
    },
    {
        "role": f"{ds[1]['speaker_id']}",
        "content": [
            {"type": "text", "text": ds[1]["text"]},
            {"type": "audio", "path": ds[1]["audio"]["array"]},
        ],
    },
    {
        "role": f"{ds[2]['speaker_id']}",
        "content": [
            {"type": "text", "text": ds[2]["text"]},
        ],
    },
]

padded_inputs_1 = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

print("\n" + "="*50)
print("First generation - compiling and recording CUDA graphs...")
with TimerContext("First generation"):
    _ = model.generate(**padded_inputs_1, **gen_kwargs)
print("="*50)

print("\n" + "="*50)
print("Second generation - fast !!!")
with TimerContext("Second generation"):
    _ = model.generate(**padded_inputs_1, **gen_kwargs)
print("="*50)

# now with different inputs
conversation = [
    {
        "role": f"{ds[0]['speaker_id']}",
        "content": [
            {"type": "text", "text": ds[2]["text"]},
            {"type": "audio", "path": ds[2]["audio"]["array"]},
        ],
    },
    {
        "role": f"{ds[1]['speaker_id']}",
        "content": [
            {"type": "text", "text": ds[3]["text"]},
            {"type": "audio", "path": ds[3]["audio"]["array"]},
        ],
    },
    {
        "role": f"{ds[2]['speaker_id']}",
        "content": [
            {"type": "text", "text": ds[4]["text"]},
        ],
    },
]
padded_inputs_2 = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

print("\n" + "="*50)
print("Generation with other inputs!")
with TimerContext("Generation with different inputs"):
    _ = model.generate(**padded_inputs_2, **gen_kwargs)
print("="*50)
```

### Training

CSM Transformers integration supports training!

```python
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset, Audio

model_id = "eustlb/csm-1b"
device = "cuda"

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
model.train()

ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
# ensure the audio is 24kHz
ds = ds.cast_column("audio", Audio(sampling_rate=24000))
conversation = []

# context
for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
    conversation.append(
        {
            "role": f"{speaker_id}",
            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        }
    )

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
    output_labels=True,
).to(device)

out = model(**inputs)
out.loss.backward()
```

This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
The original code can be found [here](https://github.com/SesameAILabs/csm).

## CsmConfig

[[autodoc]] CsmConfig

## CsmDepthDecoderConfig

[[autodoc]] CsmDepthDecoderConfig

## CsmProcessor

[[autodoc]] CsmProcessor
    - __call__

## CsmForConditionalGeneration

[[autodoc]] CsmForConditionalGeneration
    - forward
    - generate

## CsmDepthDecoderForCausalLM

[[autodoc]] CsmDepthDecoderForCausalLM

## CsmDepthDecoderModel

[[autodoc]] CsmDepthDecoderModel

## CsmBackboneModel

[[autodoc]] CsmBackboneModel