Release: v4.52.1

Revert parallelism temporarily (#38240 )
* Revert "Protect ParallelInterface" This reverts commit cb513e35f9c096d60558bd43110837cbb66611ce. * Revert "parallelism goes brrr (#37877)" This reverts commit 1c2f36b480e02c9027d2523746d34e27b39e01a4. * Empty commit
2025-10-21 01:23:56 +08:00 · 2025-05-20 22:45:10 +02:00 · 2025-05-20 22:43:54 +02:00 · 2025-05-20 18:26:11 +02:00 · 2025-05-20 18:11:22 +02:00 · 2025-05-20 17:35:04 +02:00
3448 changed files with 191349 additions and 277371 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -7,6 +7,18 @@ parameters:
    nightly:
        type: boolean
        default: false
+    GHA_Actor:
+        type: string
+        default: ""
+    GHA_Action:
+        type: string
+        default: ""
+    GHA_Event:
+        type: string
+        default: ""
+    GHA_Meta:
+        type: string
+        default: ""

 jobs:
    # Ensure running with CircleCI/huggingface
@ -31,8 +43,10 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: if [[ "$CIRCLE_PULL_REQUEST" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
-            - run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/${CIRCLE_PULL_REQUEST##*/} >> github.txt'
+            - run: python3 utils/extract_pr_number_from_circleci.py > pr_number.txt
+            - run: echo $(cat pr_number.txt)
+            - run: if [[ "$(cat pr_number.txt)" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
+            - run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$(cat pr_number.txt) >> github.txt'
            - run: cat github.txt
            - run: (python3 -c 'import json; from datetime import datetime; fp = open("github.txt"); data = json.load(fp); fp.close(); f = "%Y-%m-%dT%H:%M:%SZ"; created = datetime.strptime(data["created_at"], f); updated = datetime.strptime(data["updated_at"], f); s = (updated - created).total_seconds(); print(int(s))' || true) > elapsed.txt
            - run: if [ "$(cat elapsed.txt)" == "" ]; then echo 60 > elapsed.txt; fi
@ -154,7 +168,7 @@ jobs:
                  path: ~/transformers/installed.txt
            - run: python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
            - run: ruff check examples tests src utils
-            - run: ruff format tests src utils --check
+            - run: ruff format examples tests src utils --check
            - run: python utils/custom_init_isort.py --check_only
            - run: python utils/sort_auto_mappings.py --check_only
            - run: python utils/check_doc_toc.py
@ -178,8 +192,7 @@ jobs:
            - store_artifacts:
                  path: ~/transformers/installed.txt
            - run: python utils/check_copies.py
-            - run: python utils/check_modular_conversion.py --num_workers 4
-            - run: python utils/check_table.py
+            - run: python utils/check_modular_conversion.py
            - run: python utils/check_dummies.py
            - run: python utils/check_repo.py
            - run: python utils/check_inits.py
@ -189,7 +202,6 @@ jobs:
            - run: make deps_table_check_updated
            - run: python utils/update_metadata.py --check-only
            - run: python utils/check_docstrings.py
-            - run: python utils/check_support_list.py

 workflows:
    version: 2
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -28,11 +28,32 @@ COMMON_ENV_VARIABLES = {
    "TRANSFORMERS_IS_CI": True,
    "PYTEST_TIMEOUT": 120,
    "RUN_PIPELINE_TESTS": False,
+    # will be adjust in `CircleCIJob.to_dict`.
+    "RUN_FLAKY": True,
 }
 # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical
-COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsfE":None}
+COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}
 DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}]

+# Strings that commonly appear in the output of flaky tests when they fail. These are used with `pytest-rerunfailures`
+# to rerun the tests that match these patterns.
+FLAKY_TEST_FAILURE_PATTERNS = [
+    "OSError",  # Machine/connection transient error
+    "Timeout",  # Machine/connection transient error
+    "ConnectionError",  # Connection transient error
+    "FileNotFoundError",  # Raised by `datasets` on Hub failures
+    "PIL.UnidentifiedImageError",  # Raised by `PIL.Image.open` on connection issues
+    "HTTPError",  # Also catches HfHubHTTPError
+    "AssertionError: Tensor-likes are not close!",  # `torch.testing.assert_close`, we might have unlucky random values
+    # TODO: error downloading tokenizer's `merged.txt` from hub can cause all the exceptions below. Throw and handle
+    # them under a single message.
+    "TypeError: expected str, bytes or os.PathLike object, not NoneType",
+    "TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType",
+    "Converting from Tiktoken failed",
+    "KeyError: <class ",
+    "TypeError: not a string",
+]
+

 class EmptyJob:
    job_name = "empty"
@ -89,6 +110,7 @@ class CircleCIJob:
            print(f"Using {self.docker_image} docker image")
        if self.install_steps is None:
            self.install_steps = ["uv venv && uv pip install ."]
+        self.install_steps.append("uv venv && uv pip install git+https://github.com/ydshieh/pytest.git@8.3.5-ydshieh git+https://github.com/ydshieh/pluggy.git@1.5.0-ydshieh")
        if self.pytest_options is None:
            self.pytest_options = {}
        if isinstance(self.tests_to_run, str):
@ -107,6 +129,8 @@ class CircleCIJob:

    def to_dict(self):
        env = COMMON_ENV_VARIABLES.copy()
+        # Do not run tests decorated by @is_flaky on pull requests
+        env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
        env.update(self.additional_env)

        job = {
@ -124,7 +148,9 @@ class CircleCIJob:
                # Examples special case: we need to download NLTK files in advance to avoid cuncurrency issues
        timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else ""
        marker_cmd = f"-m '{self.marker}'" if self.marker is not None else ""
-        additional_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
+        junit_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
+        joined_flaky_patterns = "|".join(FLAKY_TEST_FAILURE_PATTERNS)
+        repeat_on_failure_flags = f"--reruns 5 --reruns-delay 2 --only-rerun '({joined_flaky_patterns})'"
        parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> '
        steps = [
            "checkout",
@ -150,9 +176,10 @@ class CircleCIJob:
                    "command": f"TESTS=$(circleci tests split  --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt"
                    }
            },
+            {"run": {"name": "fetch hub objects before pytest", "command": "python3 utils/fetch_hub_objects_for_ci.py"}},
            {"run": {
                "name": "Run tests",
-                "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {additional_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
+                "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
            },
            {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
            {"run": {"name": "Failed tests: show reasons",   "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
@ -185,6 +212,9 @@ torch_job = CircleCIJob(
 generate_job = CircleCIJob(
    "generate",
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
+    # networkx==3.3 (after #36957) cause some issues
+    # TODO: remove this once it works directly
+    install_steps=["uv venv && uv pip install . && uv pip install networkx==3.2.1"],
    marker="generate",
    parallelism=6,
 )
@ -248,6 +278,7 @@ examples_torch_job = CircleCIJob(
    docker_image=[{"image":"huggingface/transformers-examples-torch"}],
    # TODO @ArthurZucker remove this once docker is easier to build
    install_steps=["uv venv && uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
+    pytest_num_workers=4,
 )


@ -255,6 +286,7 @@ examples_tensorflow_job = CircleCIJob(
    "examples_tensorflow",
    additional_env={"OMP_NUM_THREADS": 8},
    docker_image=[{"image":"huggingface/transformers-examples-tf"}],
+    pytest_num_workers=2,
 )


@ -305,6 +337,9 @@ repo_utils_job = CircleCIJob(
 non_model_job = CircleCIJob(
    "non_model",
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
+    # networkx==3.3 (after #36957) cause some issues
+    # TODO: remove this once it works directly
+    install_steps=["uv venv && uv pip install . && uv pip install networkx==3.2.1"],
    marker="not generate",
    parallelism=6,
 )
@ -334,9 +369,9 @@ doc_test_job = CircleCIJob(
    pytest_num_workers=1,
 )

-REGULAR_TESTS = [torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
-EXAMPLES_TESTS = [examples_torch_job, examples_tensorflow_job]
-PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job]
+REGULAR_TESTS = [torch_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
+EXAMPLES_TESTS = [examples_torch_job]
+PIPELINE_TESTS = [pipelines_torch_job]
 REPO_UTIL_TESTS = [repo_utils_job]
 DOC_TESTS = [doc_test_job]
 ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job]  # fmt: skip
@ -363,7 +398,12 @@ def create_circleci_config(folder=None):
        "parameters": {
            # Only used to accept the parameters from the trigger
            "nightly": {"type": "boolean", "default": False},
-            "tests_to_run": {"type": "string", "default": ''},
+            # Only used to accept the parameters from GitHub Actions trigger
+            "GHA_Actor": {"type": "string", "default": ""},
+            "GHA_Action": {"type": "string", "default": ""},
+            "GHA_Event": {"type": "string", "default": ""},
+            "GHA_Meta": {"type": "string", "default": ""},
+            "tests_to_run": {"type": "string", "default": ""},
            **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs},
            **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs},
        },
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -16,7 +16,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
      placeholder: transformers version, platform, python version, ...
    validations:
      required: true
@ -38,24 +38,30 @@ body:

          - text models: @ArthurZucker
          - vision models: @amyeroberts, @qubvel
-          - speech models: @ylacombe, @eustlb
+          - speech models: @eustlb
          - graph models: @clefourrier

        Library:

-          - flax: @sanchit-gandhi
+          - flax: @gante and @Rocketknight1
          - generate: @zucchini-nlp (visual-language models) or @gante (all others)
          - pipelines: @Rocketknight1
          - tensorflow: @gante and @Rocketknight1
          - tokenizers: @ArthurZucker and @itazap
-          - trainer: @muellerzr @SunMarc
+          - trainer: @zach-huggingface @SunMarc

        Integrations:

-          - deepspeed: HF Trainer/Accelerate: @muellerzr
+          - deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface
          - ray/raytune: @richardliaw, @amogkam
          - Big Model Inference: @SunMarc
          - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
+        
+        Devices/Backends:
+        
+          - AMD ROCm: @ivarflakstad
+          - Intel XPU: @IlyasMoutawwakil
+          - Ascend NPU: @ivarflakstad 

        Documentation: @stevhliu

@ -72,7 +78,7 @@ body:

        Maintained examples (not research project or legacy):

-          - Flax: @sanchit-gandhi
+          - Flax: @Rocketknight1
          - PyTorch: See Models above and tag the person corresponding to the modality of the example.
          - TensorFlow: @Rocketknight1

--- a/.github/ISSUE_TEMPLATE/i18n.md
+++ b/.github/ISSUE_TEMPLATE/i18n.md
@ -23,7 +23,7 @@ Some notes:
 * Please translate in a gender-neutral way.
 * Add your translations to the folder called `<languageCode>` inside the [source folder](https://github.com/huggingface/transformers/tree/main/docs/source).
 * Register your translation in `<languageCode>/_toctree.yml`; please follow the order of the [English version](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml).
-* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu and @MKhalusova for review.
+* Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu for review.
 * 🙋 If you'd like others to help you with the translation, you can also post in the 🤗 [forums](https://discuss.huggingface.co/).

 ## Get Started section
--- a/.github/ISSUE_TEMPLATE/migration.yml
+++ b/.github/ISSUE_TEMPLATE/migration.yml
@ -6,7 +6,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
      render: shell
      placeholder: transformers version, platform, python version, ...
    validations:
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -41,22 +41,22 @@ Models:

 - text models: @ArthurZucker
 - vision models: @amyeroberts, @qubvel
- speech models: @ylacombe, @eustlb
+- speech models: @eustlb
 - graph models: @clefourrier

 Library:

- flax: @sanchit-gandhi
+- flax: @gante and @Rocketknight1
 - generate: @zucchini-nlp (visual-language models) or @gante (all others)
 - pipelines: @Rocketknight1
 - tensorflow: @gante and @Rocketknight1
 - tokenizers: @ArthurZucker
- trainer: @muellerzr and @SunMarc
+- trainer: @zach-huggingface and @SunMarc
 - chat templates: @Rocketknight1

 Integrations:

- deepspeed: HF Trainer/Accelerate: @muellerzr
+- deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface
 - ray/raytune: @richardliaw, @amogkam
 - Big Model Inference: @SunMarc
 - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
@ -72,7 +72,7 @@ HF projects:

 Maintained examples (not research project or legacy):

- Flax: @sanchit-gandhi
+- Flax: @Rocketknight1
 - PyTorch: See Models above and tag the person corresponding to the modality of the example.
 - TensorFlow: @Rocketknight1

--- a/.github/scripts/assign_reviewers.py
+++ b/.github/scripts/assign_reviewers.py
@ -0,0 +1,120 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import github
+import json
+from github import Github
+import re
+from collections import Counter
+from pathlib import Path
+
+def pattern_to_regex(pattern):
+    if pattern.startswith("/"):
+        start_anchor = True
+        pattern = re.escape(pattern[1:])
+    else:
+        start_anchor = False
+        pattern = re.escape(pattern)
+    # Replace `*` with "any number of non-slash characters"
+    pattern = pattern.replace(r"\*", "[^/]*")
+    if start_anchor:
+        pattern = r"^\/?" + pattern  # Allow an optional leading slash after the start of the string
+    return pattern
+
+def get_file_owners(file_path, codeowners_lines):
+    # Process lines in reverse (last matching pattern takes precedence)
+    for line in reversed(codeowners_lines):
+        # Skip comments and empty lines, strip inline comments
+        line = line.split('#')[0].strip()
+        if not line:
+            continue
+
+        # Split into pattern and owners
+        parts = line.split()
+        pattern = parts[0]
+        # Can be empty, e.g. for dummy files with explicitly no owner!
+        owners = [owner.removeprefix("@") for owner in parts[1:]]
+
+        # Check if file matches pattern
+        file_regex = pattern_to_regex(pattern)
+        if re.search(file_regex, file_path) is not None:
+            return owners  # Remember, can still be empty!
+    return []  # Should never happen, but just in case
+
+def pr_author_is_in_hf(pr_author, codeowners_lines):
+    # Check if the PR author is in the codeowners file
+    for line in codeowners_lines:
+        line = line.split('#')[0].strip()
+        if not line:
+            continue
+
+        # Split into pattern and owners
+        parts = line.split()
+        owners = [owner.removeprefix("@") for owner in parts[1:]]
+
+        if pr_author in owners:
+            return True
+    return False
+
+def main():
+    script_dir = Path(__file__).parent.absolute()
+    with open(script_dir / "codeowners_for_review_action") as f:
+        codeowners_lines = f.readlines()
+
+    g = Github(os.environ['GITHUB_TOKEN'])
+    repo = g.get_repo("huggingface/transformers")
+    with open(os.environ['GITHUB_EVENT_PATH']) as f:
+        event = json.load(f)
+
+    # The PR number is available in the event payload
+    pr_number = event['pull_request']['number']
+    pr = repo.get_pull(pr_number)
+    pr_author = pr.user.login
+    if pr_author_is_in_hf(pr_author, codeowners_lines):
+        print(f"PR author {pr_author} is in codeowners, skipping review request.")
+        return
+
+    existing_reviews = list(pr.get_reviews())
+    if existing_reviews:
+        print(f"Already has reviews: {[r.user.login for r in existing_reviews]}")
+        return
+
+    users_requested, teams_requested = pr.get_review_requests()
+    users_requested = list(users_requested)
+    if users_requested:
+        print(f"Reviewers already requested: {users_requested}")
+        return
+
+    locs_per_owner = Counter()
+    for file in pr.get_files():
+        owners = get_file_owners(file.filename, codeowners_lines)
+        for owner in owners:
+            locs_per_owner[owner] += file.changes
+
+    # Assign the top 2 based on locs changed as reviewers, but skip the owner if present
+    locs_per_owner.pop(pr_author, None)
+    top_owners = locs_per_owner.most_common(2)
+    print("Top owners", top_owners)
+    top_owners = [owner[0] for owner in top_owners]
+    try:
+        pr.create_review_request(top_owners)
+    except github.GithubException as e:
+        print(f"Failed to request review for {top_owners}: {e}")
+
+
+
+if __name__ == "__main__":
+    main()
--- a/.github/scripts/codeowners_for_review_action
+++ b/.github/scripts/codeowners_for_review_action
@ -0,0 +1,370 @@
+# Top-level rules are matched only if nothing else matches
+* @Rocketknight1 @ArthurZucker # if no one is pinged based on the other rules, he will do the dispatch
+*.md @stevhliu
+*tokenization* @ArthurZucker
+docs/ @stevhliu
+/benchmark/ @McPatate
+/docker/ @ydshieh @ArthurZucker
+
+# More high-level globs catch cases when specific rules later don't apply
+/src/transformers/models/*/processing* @molbap @yonigozlan @qubvel
+/src/transformers/models/*/image_processing* @qubvel
+/src/transformers/models/*/image_processing_*_fast* @yonigozlan
+
+# Owners of subsections of the library
+/src/transformers/generation/ @gante
+/src/transformers/pipeline/ @Rocketknight1 @yonigozlan
+/src/transformers/integrations/ @SunMarc @MekkCyber @zach-huggingface
+/src/transformers/quantizers/ @SunMarc @MekkCyber
+tests/ @ydshieh
+tests/generation/ @gante
+
+/src/transformers/models/auto/ @ArthurZucker
+/src/transformers/utils/ @ArthurZucker @Rocketknight1
+/src/transformers/loss/ @ArthurZucker
+/src/transformers/onnx/ @michaelbenayoun
+
+# Specific files come after the sections/globs, so they take priority
+/.circleci/config.yml @ArthurZucker @ydshieh
+/utils/tests_fetcher.py @ydshieh
+trainer.py @zach-huggingface @SunMarc
+trainer_utils.py @zach-huggingface @SunMarc
+/utils/modular_model_converter.py @Cyrilvallez @ArthurZucker
+
+# Owners of individual models are specific / high priority, and so they come last
+# mod* captures modeling and modular files
+
+# Text models
+/src/transformers/models/albert/mod*_albert* @ArthurZucker
+/src/transformers/models/bamba/mod*_bamba* @ArthurZucker
+/src/transformers/models/bart/mod*_bart* @ArthurZucker
+/src/transformers/models/barthez/mod*_barthez* @ArthurZucker
+/src/transformers/models/bartpho/mod*_bartpho* @ArthurZucker
+/src/transformers/models/bert/mod*_bert* @ArthurZucker
+/src/transformers/models/bert_generation/mod*_bert_generation* @ArthurZucker
+/src/transformers/models/bert_japanese/mod*_bert_japanese* @ArthurZucker
+/src/transformers/models/bertweet/mod*_bertweet* @ArthurZucker
+/src/transformers/models/big_bird/mod*_big_bird* @ArthurZucker
+/src/transformers/models/bigbird_pegasus/mod*_bigbird_pegasus* @ArthurZucker
+/src/transformers/models/biogpt/mod*_biogpt* @ArthurZucker
+/src/transformers/models/blenderbot/mod*_blenderbot* @ArthurZucker
+/src/transformers/models/blenderbot_small/mod*_blenderbot_small* @ArthurZucker
+/src/transformers/models/bloom/mod*_bloom* @ArthurZucker
+/src/transformers/models/bort/mod*_bort* @ArthurZucker
+/src/transformers/models/byt5/mod*_byt5* @ArthurZucker
+/src/transformers/models/camembert/mod*_camembert* @ArthurZucker
+/src/transformers/models/canine/mod*_canine* @ArthurZucker
+/src/transformers/models/codegen/mod*_codegen* @ArthurZucker
+/src/transformers/models/code_llama/mod*_code_llama* @ArthurZucker
+/src/transformers/models/cohere/mod*_cohere* @ArthurZucker
+/src/transformers/models/cohere2/mod*_cohere2* @ArthurZucker
+/src/transformers/models/convbert/mod*_convbert* @ArthurZucker
+/src/transformers/models/cpm/mod*_cpm* @ArthurZucker
+/src/transformers/models/cpmant/mod*_cpmant* @ArthurZucker
+/src/transformers/models/ctrl/mod*_ctrl* @ArthurZucker
+/src/transformers/models/dbrx/mod*_dbrx* @ArthurZucker
+/src/transformers/models/deberta/mod*_deberta* @ArthurZucker
+/src/transformers/models/deberta_v2/mod*_deberta_v2* @ArthurZucker
+/src/transformers/models/dialogpt/mod*_dialogpt* @ArthurZucker
+/src/transformers/models/diffllama/mod*_diffllama* @ArthurZucker
+/src/transformers/models/distilbert/mod*_distilbert* @ArthurZucker
+/src/transformers/models/dpr/mod*_dpr* @ArthurZucker
+/src/transformers/models/electra/mod*_electra* @ArthurZucker
+/src/transformers/models/encoder_decoder/mod*_encoder_decoder* @ArthurZucker
+/src/transformers/models/ernie/mod*_ernie* @ArthurZucker
+/src/transformers/models/ernie_m/mod*_ernie_m* @ArthurZucker
+/src/transformers/models/esm/mod*_esm* @ArthurZucker
+/src/transformers/models/falcon/mod*_falcon* @ArthurZucker
+/src/transformers/models/falcon3/mod*_falcon3* @ArthurZucker
+/src/transformers/models/falcon_mamba/mod*_falcon_mamba* @ArthurZucker
+/src/transformers/models/fastspeech2_conformer/mod*_fastspeech2_conformer* @ArthurZucker
+/src/transformers/models/flan_t5/mod*_flan_t5* @ArthurZucker
+/src/transformers/models/flan_ul2/mod*_flan_ul2* @ArthurZucker
+/src/transformers/models/flaubert/mod*_flaubert* @ArthurZucker
+/src/transformers/models/fnet/mod*_fnet* @ArthurZucker
+/src/transformers/models/fsmt/mod*_fsmt* @ArthurZucker
+/src/transformers/models/funnel/mod*_funnel* @ArthurZucker
+/src/transformers/models/fuyu/mod*_fuyu* @ArthurZucker
+/src/transformers/models/gemma/mod*_gemma* @ArthurZucker
+/src/transformers/models/gemma2/mod*_gemma2* @ArthurZucker
+/src/transformers/models/glm/mod*_glm* @ArthurZucker
+/src/transformers/models/openai_gpt/mod*_openai_gpt* @ArthurZucker
+/src/transformers/models/gpt_neo/mod*_gpt_neo* @ArthurZucker
+/src/transformers/models/gpt_neox/mod*_gpt_neox* @ArthurZucker
+/src/transformers/models/gpt_neox_japanese/mod*_gpt_neox_japanese* @ArthurZucker
+/src/transformers/models/gptj/mod*_gptj* @ArthurZucker
+/src/transformers/models/gpt2/mod*_gpt2* @ArthurZucker
+/src/transformers/models/gpt_bigcode/mod*_gpt_bigcode* @ArthurZucker
+/src/transformers/models/gptsan_japanese/mod*_gptsan_japanese* @ArthurZucker
+/src/transformers/models/gpt_sw3/mod*_gpt_sw3* @ArthurZucker
+/src/transformers/models/granite/mod*_granite* @ArthurZucker
+/src/transformers/models/granitemoe/mod*_granitemoe* @ArthurZucker
+/src/transformers/models/herbert/mod*_herbert* @ArthurZucker
+/src/transformers/models/ibert/mod*_ibert* @ArthurZucker
+/src/transformers/models/jamba/mod*_jamba* @ArthurZucker
+/src/transformers/models/jetmoe/mod*_jetmoe* @ArthurZucker
+/src/transformers/models/jukebox/mod*_jukebox* @ArthurZucker
+/src/transformers/models/led/mod*_led* @ArthurZucker
+/src/transformers/models/llama/mod*_llama* @ArthurZucker @Cyrilvallez
+/src/transformers/models/longformer/mod*_longformer* @ArthurZucker
+/src/transformers/models/longt5/mod*_longt5* @ArthurZucker
+/src/transformers/models/luke/mod*_luke* @ArthurZucker
+/src/transformers/models/m2m_100/mod*_m2m_100* @ArthurZucker
+/src/transformers/models/madlad_400/mod*_madlad_400* @ArthurZucker
+/src/transformers/models/mamba/mod*_mamba* @ArthurZucker
+/src/transformers/models/mamba2/mod*_mamba2* @ArthurZucker
+/src/transformers/models/marian/mod*_marian* @ArthurZucker
+/src/transformers/models/markuplm/mod*_markuplm* @ArthurZucker
+/src/transformers/models/mbart/mod*_mbart* @ArthurZucker
+/src/transformers/models/mega/mod*_mega* @ArthurZucker
+/src/transformers/models/megatron_bert/mod*_megatron_bert* @ArthurZucker
+/src/transformers/models/megatron_gpt2/mod*_megatron_gpt2* @ArthurZucker
+/src/transformers/models/mistral/mod*_mistral* @ArthurZucker
+/src/transformers/models/mixtral/mod*_mixtral* @ArthurZucker
+/src/transformers/models/mluke/mod*_mluke* @ArthurZucker
+/src/transformers/models/mobilebert/mod*_mobilebert* @ArthurZucker
+/src/transformers/models/modernbert/mod*_modernbert* @ArthurZucker
+/src/transformers/models/mpnet/mod*_mpnet* @ArthurZucker
+/src/transformers/models/mpt/mod*_mpt* @ArthurZucker
+/src/transformers/models/mra/mod*_mra* @ArthurZucker
+/src/transformers/models/mt5/mod*_mt5* @ArthurZucker
+/src/transformers/models/mvp/mod*_mvp* @ArthurZucker
+/src/transformers/models/myt5/mod*_myt5* @ArthurZucker
+/src/transformers/models/nemotron/mod*_nemotron* @ArthurZucker
+/src/transformers/models/nezha/mod*_nezha* @ArthurZucker
+/src/transformers/models/nllb/mod*_nllb* @ArthurZucker
+/src/transformers/models/nllb_moe/mod*_nllb_moe* @ArthurZucker
+/src/transformers/models/nystromformer/mod*_nystromformer* @ArthurZucker
+/src/transformers/models/olmo/mod*_olmo* @ArthurZucker
+/src/transformers/models/olmo2/mod*_olmo2* @ArthurZucker
+/src/transformers/models/olmoe/mod*_olmoe* @ArthurZucker
+/src/transformers/models/open_llama/mod*_open_llama* @ArthurZucker
+/src/transformers/models/opt/mod*_opt* @ArthurZucker
+/src/transformers/models/pegasus/mod*_pegasus* @ArthurZucker
+/src/transformers/models/pegasus_x/mod*_pegasus_x* @ArthurZucker
+/src/transformers/models/persimmon/mod*_persimmon* @ArthurZucker
+/src/transformers/models/phi/mod*_phi* @ArthurZucker
+/src/transformers/models/phi3/mod*_phi3* @ArthurZucker
+/src/transformers/models/phimoe/mod*_phimoe* @ArthurZucker
+/src/transformers/models/phobert/mod*_phobert* @ArthurZucker
+/src/transformers/models/plbart/mod*_plbart* @ArthurZucker
+/src/transformers/models/prophetnet/mod*_prophetnet* @ArthurZucker
+/src/transformers/models/qdqbert/mod*_qdqbert* @ArthurZucker
+/src/transformers/models/qwen2/mod*_qwen2* @ArthurZucker
+/src/transformers/models/qwen2_moe/mod*_qwen2_moe* @ArthurZucker
+/src/transformers/models/rag/mod*_rag* @ArthurZucker
+/src/transformers/models/realm/mod*_realm* @ArthurZucker
+/src/transformers/models/recurrent_gemma/mod*_recurrent_gemma* @ArthurZucker
+/src/transformers/models/reformer/mod*_reformer* @ArthurZucker
+/src/transformers/models/rembert/mod*_rembert* @ArthurZucker
+/src/transformers/models/retribert/mod*_retribert* @ArthurZucker
+/src/transformers/models/roberta/mod*_roberta* @ArthurZucker
+/src/transformers/models/roberta_prelayernorm/mod*_roberta_prelayernorm* @ArthurZucker
+/src/transformers/models/roc_bert/mod*_roc_bert* @ArthurZucker
+/src/transformers/models/roformer/mod*_roformer* @ArthurZucker
+/src/transformers/models/rwkv/mod*_rwkv* @ArthurZucker
+/src/transformers/models/splinter/mod*_splinter* @ArthurZucker
+/src/transformers/models/squeezebert/mod*_squeezebert* @ArthurZucker
+/src/transformers/models/stablelm/mod*_stablelm* @ArthurZucker
+/src/transformers/models/starcoder2/mod*_starcoder2* @ArthurZucker
+/src/transformers/models/switch_transformers/mod*_switch_transformers* @ArthurZucker
+/src/transformers/models/t5/mod*_t5* @ArthurZucker
+/src/transformers/models/t5v1.1/mod*_t5v1.1* @ArthurZucker
+/src/transformers/models/tapex/mod*_tapex* @ArthurZucker
+/src/transformers/models/transfo_xl/mod*_transfo_xl* @ArthurZucker
+/src/transformers/models/ul2/mod*_ul2* @ArthurZucker
+/src/transformers/models/umt5/mod*_umt5* @ArthurZucker
+/src/transformers/models/xmod/mod*_xmod* @ArthurZucker
+/src/transformers/models/xglm/mod*_xglm* @ArthurZucker
+/src/transformers/models/xlm/mod*_xlm* @ArthurZucker
+/src/transformers/models/xlm_prophetnet/mod*_xlm_prophetnet* @ArthurZucker
+/src/transformers/models/xlm_roberta/mod*_xlm_roberta* @ArthurZucker
+/src/transformers/models/xlm_roberta_xl/mod*_xlm_roberta_xl* @ArthurZucker
+/src/transformers/models/xlm_v/mod*_xlm_v* @ArthurZucker
+/src/transformers/models/xlnet/mod*_xlnet* @ArthurZucker
+/src/transformers/models/yoso/mod*_yoso* @ArthurZucker
+/src/transformers/models/zamba/mod*_zamba* @ArthurZucker
+
+# Vision models
+/src/transformers/models/beit/mod*_beit* @amyeroberts @qubvel
+/src/transformers/models/bit/mod*_bit* @amyeroberts @qubvel
+/src/transformers/models/conditional_detr/mod*_conditional_detr* @amyeroberts @qubvel
+/src/transformers/models/convnext/mod*_convnext* @amyeroberts @qubvel
+/src/transformers/models/convnextv2/mod*_convnextv2* @amyeroberts @qubvel
+/src/transformers/models/cvt/mod*_cvt* @amyeroberts @qubvel
+/src/transformers/models/deformable_detr/mod*_deformable_detr* @amyeroberts @qubvel
+/src/transformers/models/deit/mod*_deit* @amyeroberts @qubvel
+/src/transformers/models/depth_anything/mod*_depth_anything* @amyeroberts @qubvel
+/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @amyeroberts @qubvel
+/src/transformers/models/deta/mod*_deta* @amyeroberts @qubvel
+/src/transformers/models/detr/mod*_detr* @amyeroberts @qubvel
+/src/transformers/models/dinat/mod*_dinat* @amyeroberts @qubvel
+/src/transformers/models/dinov2/mod*_dinov2* @amyeroberts @qubvel
+/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @amyeroberts @qubvel
+/src/transformers/models/dit/mod*_dit* @amyeroberts @qubvel
+/src/transformers/models/dpt/mod*_dpt* @amyeroberts @qubvel
+/src/transformers/models/efficientformer/mod*_efficientformer* @amyeroberts @qubvel
+/src/transformers/models/efficientnet/mod*_efficientnet* @amyeroberts @qubvel
+/src/transformers/models/focalnet/mod*_focalnet* @amyeroberts @qubvel
+/src/transformers/models/glpn/mod*_glpn* @amyeroberts @qubvel
+/src/transformers/models/hiera/mod*_hiera* @amyeroberts @qubvel
+/src/transformers/models/ijepa/mod*_ijepa* @amyeroberts @qubvel
+/src/transformers/models/imagegpt/mod*_imagegpt* @amyeroberts @qubvel
+/src/transformers/models/levit/mod*_levit* @amyeroberts @qubvel
+/src/transformers/models/mask2former/mod*_mask2former* @amyeroberts @qubvel
+/src/transformers/models/maskformer/mod*_maskformer* @amyeroberts @qubvel
+/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @amyeroberts @qubvel
+/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @amyeroberts @qubvel
+/src/transformers/models/mobilevit/mod*_mobilevit* @amyeroberts @qubvel
+/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @amyeroberts @qubvel
+/src/transformers/models/nat/mod*_nat* @amyeroberts @qubvel
+/src/transformers/models/poolformer/mod*_poolformer* @amyeroberts @qubvel
+/src/transformers/models/pvt/mod*_pvt* @amyeroberts @qubvel
+/src/transformers/models/pvt_v2/mod*_pvt_v2* @amyeroberts @qubvel
+/src/transformers/models/regnet/mod*_regnet* @amyeroberts @qubvel
+/src/transformers/models/resnet/mod*_resnet* @amyeroberts @qubvel
+/src/transformers/models/rt_detr/mod*_rt_detr* @amyeroberts @qubvel
+/src/transformers/models/segformer/mod*_segformer* @amyeroberts @qubvel
+/src/transformers/models/seggpt/mod*_seggpt* @amyeroberts @qubvel
+/src/transformers/models/superpoint/mod*_superpoint* @amyeroberts @qubvel
+/src/transformers/models/swiftformer/mod*_swiftformer* @amyeroberts @qubvel
+/src/transformers/models/swin/mod*_swin* @amyeroberts @qubvel
+/src/transformers/models/swinv2/mod*_swinv2* @amyeroberts @qubvel
+/src/transformers/models/swin2sr/mod*_swin2sr* @amyeroberts @qubvel
+/src/transformers/models/table_transformer/mod*_table_transformer* @amyeroberts @qubvel
+/src/transformers/models/textnet/mod*_textnet* @amyeroberts @qubvel
+/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @amyeroberts @qubvel
+/src/transformers/models/upernet/mod*_upernet* @amyeroberts @qubvel
+/src/transformers/models/van/mod*_van* @amyeroberts @qubvel
+/src/transformers/models/vit/mod*_vit* @amyeroberts @qubvel
+/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @amyeroberts @qubvel
+/src/transformers/models/vitdet/mod*_vitdet* @amyeroberts @qubvel
+/src/transformers/models/vit_mae/mod*_vit_mae* @amyeroberts @qubvel
+/src/transformers/models/vitmatte/mod*_vitmatte* @amyeroberts @qubvel
+/src/transformers/models/vit_msn/mod*_vit_msn* @amyeroberts @qubvel
+/src/transformers/models/vitpose/mod*_vitpose* @amyeroberts @qubvel
+/src/transformers/models/yolos/mod*_yolos* @amyeroberts @qubvel
+/src/transformers/models/zoedepth/mod*_zoedepth* @amyeroberts @qubvel
+
+# Audio models
+/src/transformers/models/audio_spectrogram_transformer/mod*_audio_spectrogram_transformer* @eustlb
+/src/transformers/models/bark/mod*_bark* @eustlb
+/src/transformers/models/clap/mod*_clap* @eustlb
+/src/transformers/models/dac/mod*_dac* @eustlb
+/src/transformers/models/encodec/mod*_encodec* @eustlb
+/src/transformers/models/hubert/mod*_hubert* @eustlb
+/src/transformers/models/mctct/mod*_mctct* @eustlb
+/src/transformers/models/mimi/mod*_mimi* @eustlb
+/src/transformers/models/mms/mod*_mms* @eustlb
+/src/transformers/models/moshi/mod*_moshi* @eustlb
+/src/transformers/models/musicgen/mod*_musicgen* @eustlb
+/src/transformers/models/musicgen_melody/mod*_musicgen_melody* @eustlb
+/src/transformers/models/pop2piano/mod*_pop2piano* @eustlb
+/src/transformers/models/seamless_m4t/mod*_seamless_m4t* @eustlb
+/src/transformers/models/seamless_m4t_v2/mod*_seamless_m4t_v2* @eustlb
+/src/transformers/models/sew/mod*_sew* @eustlb
+/src/transformers/models/sew_d/mod*_sew_d* @eustlb
+/src/transformers/models/speech_to_text/mod*_speech_to_text* @eustlb
+/src/transformers/models/speech_to_text_2/mod*_speech_to_text_2* @eustlb
+/src/transformers/models/speecht5/mod*_speecht5* @eustlb
+/src/transformers/models/unispeech/mod*_unispeech* @eustlb
+/src/transformers/models/unispeech_sat/mod*_unispeech_sat* @eustlb
+/src/transformers/models/univnet/mod*_univnet* @eustlb
+/src/transformers/models/vits/mod*_vits* @eustlb
+/src/transformers/models/wav2vec2/mod*_wav2vec2* @eustlb
+/src/transformers/models/wav2vec2_bert/mod*_wav2vec2_bert* @eustlb
+/src/transformers/models/wav2vec2_conformer/mod*_wav2vec2_conformer* @eustlb
+/src/transformers/models/wav2vec2_phoneme/mod*_wav2vec2_phoneme* @eustlb
+/src/transformers/models/wavlm/mod*_wavlm* @eustlb
+/src/transformers/models/whisper/mod*_whisper* @eustlb
+/src/transformers/models/xls_r/mod*_xls_r* @eustlb
+/src/transformers/models/xlsr_wav2vec2/mod*_xlsr_wav2vec2* @eustlb
+
+# Video models
+/src/transformers/models/timesformer/mod*_timesformer* @Rocketknight1
+/src/transformers/models/videomae/mod*_videomae* @Rocketknight1
+/src/transformers/models/vivit/mod*_vivit* @Rocketknight1
+
+# Multimodal models
+/src/transformers/models/align/mod*_align* @zucchini-nlp
+/src/transformers/models/altclip/mod*_altclip* @zucchini-nlp
+/src/transformers/models/aria/mod*_aria* @zucchini-nlp
+/src/transformers/models/blip/mod*_blip* @zucchini-nlp
+/src/transformers/models/blip_2/mod*_blip_2* @zucchini-nlp
+/src/transformers/models/bridgetower/mod*_bridgetower* @zucchini-nlp
+/src/transformers/models/bros/mod*_bros* @zucchini-nlp
+/src/transformers/models/chameleon/mod*_chameleon* @zucchini-nlp
+/src/transformers/models/chinese_clip/mod*_chinese_clip* @zucchini-nlp
+/src/transformers/models/clip/mod*_clip* @zucchini-nlp
+/src/transformers/models/clipseg/mod*_clipseg* @zucchini-nlp
+/src/transformers/models/clvp/mod*_clvp* @zucchini-nlp
+/src/transformers/models/colpali/mod*_colpali* @zucchini-nlp @yonigozlan
+/src/transformers/models/data2vec/mod*_data2vec* @zucchini-nlp
+/src/transformers/models/deplot/mod*_deplot* @zucchini-nlp
+/src/transformers/models/donut/mod*_donut* @zucchini-nlp
+/src/transformers/models/flava/mod*_flava* @zucchini-nlp
+/src/transformers/models/git/mod*_git* @zucchini-nlp
+/src/transformers/models/grounding_dino/mod*_grounding_dino* @qubvel
+/src/transformers/models/groupvit/mod*_groupvit* @zucchini-nlp
+/src/transformers/models/idefics/mod*_idefics* @zucchini-nlp
+/src/transformers/models/idefics2/mod*_idefics2* @zucchini-nlp
+/src/transformers/models/idefics3/mod*_idefics3* @zucchini-nlp
+/src/transformers/models/instructblip/mod*_instructblip* @zucchini-nlp
+/src/transformers/models/instructblipvideo/mod*_instructblipvideo* @zucchini-nlp
+/src/transformers/models/kosmos_2/mod*_kosmos_2* @zucchini-nlp
+/src/transformers/models/layoutlm/mod*_layoutlm* @NielsRogge
+/src/transformers/models/layoutlmv2/mod*_layoutlmv2* @NielsRogge
+/src/transformers/models/layoutlmv3/mod*_layoutlmv3* @NielsRogge
+/src/transformers/models/layoutxlm/mod*_layoutxlm* @NielsRogge
+/src/transformers/models/lilt/mod*_lilt* @zucchini-nlp
+/src/transformers/models/llava/mod*_llava* @zucchini-nlp @arthurzucker
+/src/transformers/models/llava_next/mod*_llava_next* @zucchini-nlp
+/src/transformers/models/llava_next_video/mod*_llava_next_video* @zucchini-nlp
+/src/transformers/models/llava_onevision/mod*_llava_onevision* @zucchini-nlp
+/src/transformers/models/lxmert/mod*_lxmert* @zucchini-nlp
+/src/transformers/models/matcha/mod*_matcha* @zucchini-nlp
+/src/transformers/models/mgp_str/mod*_mgp_str* @zucchini-nlp
+/src/transformers/models/mllama/mod*_mllama* @zucchini-nlp
+/src/transformers/models/nougat/mod*_nougat* @NielsRogge
+/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @qubvel @yonigozlan
+/src/transformers/models/oneformer/mod*_oneformer* @zucchini-nlp
+/src/transformers/models/owlvit/mod*_owlvit* @qubvel
+/src/transformers/models/owlv2/mod*_owlv2* @qubvel
+/src/transformers/models/paligemma/mod*_paligemma* @zucchini-nlp @molbap
+/src/transformers/models/perceiver/mod*_perceiver* @zucchini-nlp
+/src/transformers/models/pix2struct/mod*_pix2struct* @zucchini-nlp
+/src/transformers/models/pixtral/mod*_pixtral* @zucchini-nlp @ArthurZucker
+/src/transformers/models/qwen2_audio/mod*_qwen2_audio* @zucchini-nlp @ArthurZucker
+/src/transformers/models/qwen2_vl/mod*_qwen2_vl* @zucchini-nlp @ArthurZucker
+/src/transformers/models/sam/mod*_sam* @zucchini-nlp @ArthurZucker
+/src/transformers/models/siglip/mod*_siglip* @zucchini-nlp
+/src/transformers/models/speech_encoder_decoder/mod*_speech_encoder_decoder* @zucchini-nlp
+/src/transformers/models/tapas/mod*_tapas* @NielsRogge
+/src/transformers/models/trocr/mod*_trocr* @zucchini-nlp
+/src/transformers/models/tvlt/mod*_tvlt* @zucchini-nlp
+/src/transformers/models/tvp/mod*_tvp* @zucchini-nlp
+/src/transformers/models/udop/mod*_udop* @zucchini-nlp
+/src/transformers/models/video_llava/mod*_video_llava* @zucchini-nlp
+/src/transformers/models/vilt/mod*_vilt* @zucchini-nlp
+/src/transformers/models/vipllava/mod*_vipllava* @zucchini-nlp
+/src/transformers/models/vision_encoder_decoder/mod*_vision_encoder_decoder* @Rocketknight1
+/src/transformers/models/vision_text_dual_encoder/mod*_vision_text_dual_encoder* @Rocketknight1
+/src/transformers/models/visual_bert/mod*_visual_bert* @zucchini-nlp
+/src/transformers/models/xclip/mod*_xclip* @zucchini-nlp
+
+# Reinforcement learning models
+/src/transformers/models/decision_transformer/mod*_decision_transformer* @Rocketknight1
+/src/transformers/models/trajectory_transformer/mod*_trajectory_transformer* @Rocketknight1
+
+# Time series models
+/src/transformers/models/autoformer/mod*_autoformer* @Rocketknight1
+/src/transformers/models/informer/mod*_informer* @Rocketknight1
+/src/transformers/models/patchtsmixer/mod*_patchtsmixer* @Rocketknight1
+/src/transformers/models/patchtst/mod*_patchtst* @Rocketknight1
+/src/transformers/models/time_series_transformer/mod*_time_series_transformer* @Rocketknight1
+
+# Graph models
+/src/transformers/models/graphormer/mod*_graphormer* @clefourrier
+
+# Finally, files with no owners that shouldn't generate pings, usually automatically generated and checked in the CI
+utils/dummy*
--- a/.github/workflows/add-model-like.yml
+++ b/.github/workflows/add-model-like.yml
@ -54,7 +54,7 @@ jobs:
      - name: Create model files
        run: |
          . ~/venv/bin/activate
-          transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
+          transformers add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
          make style
          make fix-copies

--- a/.github/workflows/assign-reviewers.yml
+++ b/.github/workflows/assign-reviewers.yml
@ -0,0 +1,26 @@
+name: Assign PR Reviewers
+on:
+  pull_request_target:
+    branches:
+      - main
+    types: [ready_for_review]
+
+jobs:
+  assign_reviewers:
+    permissions:
+       pull-requests: write
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.13'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install PyGithub
+      - name: Run assignment script
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: python .github/scripts/assign_reviewers.py
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -63,14 +63,14 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build 
+          title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

  latest-torch-deepspeed-docker:
    name: "Latest PyTorch + DeepSpeed"
    runs-on:
-      group: aws-general-8-plus
+      group: aws-g4dn-2xlarge-cache
    steps:
      -
        name: Set up Docker Buildx
@ -99,7 +99,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}}
-          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build 
+          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -140,7 +140,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build 
+          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -176,7 +176,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-doc-builder docker build 
+          title: 🤗 Results of the huggingface/transformers-doc-builder docker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -214,7 +214,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build 
+          title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -223,19 +223,19 @@ jobs:
    runs-on:
      group: aws-general-8-plus
    steps:
-      - 
+      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
-      - 
+      -
        name: Check out code
        uses: actions/checkout@v4
-      - 
+      -
        name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      - 
+      -
        name: Build and push
        uses: docker/build-push-action@v5
        with:
@ -263,7 +263,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build 
+          title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -301,7 +301,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build 
+          title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -310,19 +310,19 @@ jobs:
    runs-on:
      group: aws-general-8-plus
    steps:
-      - 
+      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
-      - 
+      -
        name: Check out code
        uses: actions/checkout@v4
-      - 
+      -
        name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      - 
+      -
        name: Build and push
        uses: docker/build-push-action@v5
        with:
@ -350,7 +350,7 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build 
+          title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -388,6 +388,6 @@ jobs:
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-quantization-latest-gpu build 
+          title: 🤗 Results of the transformers-quantization-latest-gpu build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/build-nightly-ci-docker-images.yml
+++ b/.github/workflows/build-nightly-ci-docker-images.yml
@ -42,7 +42,7 @@ jobs:
  nightly-torch-deepspeed-docker:
    name: "Nightly PyTorch + DeepSpeed"
    runs-on:
-      group: aws-general-8-plus
+      group: aws-g4dn-2xlarge-cache
    steps:
      -
        name: Set up Docker Buildx
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -14,5 +14,4 @@ jobs:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: transformers
-      languages: ar de en es fr hi it ko pt tr zh ja te
-      custom_container: huggingface/transformers-doc-builder
+      languages: en
--- a/.github/workflows/change_pr_to_draft.yml
+++ b/.github/workflows/change_pr_to_draft.yml
@ -1,25 +0,0 @@
-name: Change PR to draft
-
-on:
-  pull_request_target:
-    types: [opened, reopened]
-
-jobs:
-  convert_pr_to_draft:
-    runs-on: ubuntu-22.04
-    name: Convert PR to draft
-    permissions:
-      pull-requests: write
-      contents: write
-    if: github.event.pull_request.draft == false
-    steps:
-      - name: Convert PR to draft
-        shell: bash
-        env:
-          PR_NUMBER: ${{ github.event.number }}
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          REPO: ${{ github.repository }}
-        run: |
-          echo $PR_NUMBER
-          gh pr ready $PR_NUMBER --repo $REPO --undo
-          gh pr comment $PR_NUMBER --repo $REPO --body "Hi 👋, thank you for opening this pull request! The pull request is converted to draft by default. When it is ready for review, please click the \`Ready for review\` button (at the bottom of the PR page)."
--- a/.github/workflows/check_failed_model_tests.yml
+++ b/.github/workflows/check_failed_model_tests.yml
@ -29,7 +29,7 @@ jobs:
  run_models_gpu:
    name: " "
    runs-on:
-      group: aws-g4dn-2xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -28,7 +28,7 @@ jobs:
      matrix:
        split_keys: ${{ fromJson(inputs.split_keys) }}
    runs-on: 
-      group: aws-g4dn-2xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -15,7 +15,7 @@ jobs:
  setup:
    name: Setup
    runs-on: 
-      group: aws-g4dn-2xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -18,6 +18,10 @@ on:
      docker:
        required: true
        type: string
+      report_name_prefix:
+        required: false
+        default: run_models_gpu
+        type: string

 env:
  HF_HOME: /mnt/cache
@ -103,7 +107,7 @@ jobs:
        run: |
          echo "${{ inputs.machine_type }}"

-          if [ "${{ inputs.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -116,23 +120,23 @@ jobs:

      - name: Run all tests on GPU
        working-directory: /transformers
-        run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+        run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+        run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt

      - name: Run test
        shell: bash
        run: |
-          mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
-          echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
-          echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
+          mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
+          echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"

-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
--- a/.github/workflows/new_model_pr_merged_notification.yml
+++ b/.github/workflows/new_model_pr_merged_notification.yml
@ -59,7 +59,7 @@ jobs:
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
-                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh"
+                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh\ncommit SHA: ${{ env.COMMIT_SHA }}"
                  }
                }
              ]
--- a/.github/workflows/pr-style-bot.yml
+++ b/.github/workflows/pr-style-bot.yml
@ -0,0 +1,19 @@
+# To run this bot, comment "@bot /style" on a PR
+name: Style Bot
+
+on:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  style:
+    uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
+    with:
+      python_quality_dependencies: "[quality]"
+      style_command_type: "default"
+    secrets:
+      bot_token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/push-important-models.yml
+++ b/.github/workflows/push-important-models.yml
@ -27,7 +27,7 @@ jobs:

      - name: Get changed files
        id: changed-files
-        uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42
+        uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c
        with:
          files: src/transformers/models/**

--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
@ -145,7 +145,7 @@ jobs:
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MODELS: ${{ needs.get-tests.outputs.models }}
-          BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
+          BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
        run: |
          gh api \
            --method POST \
@ -185,7 +185,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
       group: '${{ matrix.machine_type }}'
    container:
@ -239,7 +239,7 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -292,7 +292,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -338,7 +338,7 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
--- a/.github/workflows/self-push-caller.yml
+++ b/.github/workflows/self-push-caller.yml
@ -25,7 +25,7 @@ jobs:
        
        - name: Get changed files
          id: changed-files
-          uses: tj-actions/changed-files@v41
+          uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c
        
        - name: Was setup changed 
          id: was_changed
@ -51,4 +51,4 @@ jobs:
    needs: build-docker-containers
    steps:
      - name: Trigger push CI via workflow_run
-        run: echo "Trigger push CI via workflow_run"
+        run: echo "Trigger push CI via workflow_run"
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -54,12 +54,23 @@ jobs:
      ci_event: Daily CI
    secrets: inherit

+  trainer-fsdp-ci:
+    name: Trainer/FSDP CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_trainer_and_fsdp_gpu
+      slack_report_channel: "#transformers-ci-daily-training"
+      runner: daily-ci
+      docker: huggingface/transformers-all-latest-gpu
+      ci_event: Daily CI
+    secrets: inherit
+
  deepspeed-ci:
    name: DeepSpeed CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-deepspeed"
+      slack_report_channel: "#transformers-ci-daily-training"
      runner: daily-ci
      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
      ci_event: Daily CI
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -45,11 +45,11 @@ env:

 jobs:
  setup:
-    if: contains(fromJSON('["run_models_gpu", "run_quantization_torch_gpu"]'), inputs.job)
+    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -77,12 +77,17 @@ jobs:
        run: pip freeze

      - id: set-matrix
-        if: ${{ inputs.job == 'run_models_gpu' }}
+        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
-          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
-          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
+            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
+            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
+            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
+            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
+          fi

      - id: set-matrix-quantization
        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
@ -102,7 +107,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs.yml
    with:
@ -113,13 +118,32 @@ jobs:
      docker: ${{ inputs.docker }}
    secrets: inherit

+  run_trainer_and_fsdp_gpu:
+    if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
+    name: " "
+    needs: setup
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        slice_id: [0, 1]
+    uses: ./.github/workflows/model_jobs.yml
+    with:
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      machine_type: ${{ matrix.machine_type }}
+      slice_id: ${{ matrix.slice_id }}
+      runner: ${{ inputs.runner }}
+      docker: ${{ inputs.docker }}
+      report_name_prefix: run_trainer_and_fsdp_gpu
+    secrets: inherit
+
  run_pipelines_torch_gpu:
    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
    name: PyTorch pipelines
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -153,7 +177,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -187,7 +211,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -222,7 +246,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -256,7 +280,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -290,7 +314,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -325,7 +349,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -382,12 +406,12 @@ jobs:
        run: pip freeze

      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
+        working-directory: ${{ inputs.working-directory-prefix }}/transformers
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -424,7 +448,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -467,7 +491,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -541,6 +565,7 @@ jobs:
    needs: [
      setup,
      run_models_gpu,
+      run_trainer_and_fsdp_gpu,
      run_pipelines_torch_gpu,
      run_pipelines_tf_gpu,
      run_examples_gpu,
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@ -35,7 +35,7 @@ jobs:
        shell: bash
        run: |
          if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
-            echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV
+            echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
            echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then
--- a/.github/workflows/update_metdata.yml
+++ b/.github/workflows/update_metdata.yml
@ -19,7 +19,7 @@ jobs:
      - name: Setup environment
        run: |
          pip install --upgrade pip
-          pip install datasets pandas==2.0.3
+          pip install datasets pandas
          pip install .[torch,tf,flax]

      - name: Update metadata
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -78,7 +78,7 @@ Once you've confirmed the bug hasn't already been reported, please include the f
 To get the OS and software versions automatically, run the following command:

 ```bash
-transformers-cli env
+transformers env
 ```

 You can also run the same command from the root of the repository:
@ -221,10 +221,10 @@ You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main
   [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.

   If you're modifying documents under the `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check
-   make sure you install the documentation builder:
+   make sure you install the [documentation builder](https://github.com/huggingface/doc-builder).

   ```bash
-   pip install ".[docs]"
+   pip install hf-doc-builder
   ```

   Run the following command from the root of the repository:
--- a/ISSUES.md
+++ b/ISSUES.md
@ -26,7 +26,7 @@ There are two main venues to receive support: [the forums](https://discuss.huggi

 [The user forums](https://discuss.huggingface.co/) are supported by the wide community of the library users and backed up by developers when needed.

-If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystalized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).
+If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystallized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).

 In particular all "Please explain" questions or objectively very user-specific feature requests belong to the forums. Here are some example of such questions:

@ -263,9 +263,9 @@ You are not required to read the following guidelines before opening an issue. H
    But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying it. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like:

    ```
-    > How big is your gpu cluster?
+    > How big is your GPU cluster?

-    Our cluster is made of 256 gpus.
+    Our cluster is made of 256 GPUs.
    ```

    If you are addressing multiple comments, quote the relevant parts of each before your answer. Some people use the same comment to do multiple replies, others separate them into separate comments. Either way works. The latter approach helps for linking to a specific comment.
--- a/5
+++ b/5
@ -37,7 +37,6 @@ autogenerate_code: deps_table_update
 repo-consistency:
 	python utils/check_copies.py
 	python utils/check_modular_conversion.py
-	python utils/check_table.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
 	python utils/check_inits.py
@ -46,7 +45,6 @@ repo-consistency:
 	python utils/check_doctest_list.py
 	python utils/update_metadata.py --check-only
 	python utils/check_docstrings.py
-	python utils/check_support_list.py

 # this target runs checks on all files

@ -81,8 +79,7 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency

 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
-	python utils/check_modular_conversion.py  --fix_and_overwrite
-	python utils/check_table.py --fix_and_overwrite
+	python utils/check_modular_conversion.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
 	python utils/check_doctest_list.py --fix_and_overwrite
 	python utils/check_docstrings.py --fix_and_overwrite
--- a/README.md
+++ b/README.md
@ -25,6 +25,7 @@ limitations under the License.
 </p>

 <p align="center">
+    <a href="https://huggingface.com/models"><img alt="Checkpoints on Hub" src="https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen"></a>
    <a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
    <a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>
    <a href="https://huggingface.co/docs/transformers/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online"></a>
@ -54,275 +55,258 @@ limitations under the License.
 </h4>

 <h3 align="center">
-    <p>State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow</p>
+    <p>State-of-the-art pretrained models for inference and training</p>
 </h3>

 <h3 align="center">
    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
 </h3>

-🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.
+Transformers is a library of pretrained text, computer vision, audio, video, and multimodal models for inference and training. Use Transformers to fine-tune models on your data, build inference applications, and for generative AI use cases across multiple modalities.

-These models can be applied on:
+There are over 500K+ Transformers [model checkpoints](https://huggingface.co/models?library=transformers&sort=trending) on the [Hugging Face Hub](https://huggingface.com/models) you can use.

-* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, and text generation, in over 100 languages.
-* 🖼️ Images, for tasks like image classification, object detection, and segmentation.
-* 🗣️ Audio, for tasks like speech recognition and audio classification.
+Explore the [Hub](https://huggingface.com/) today to find a model and use Transformers to help you get started right away.

-Transformer models can also perform tasks on **several modalities combined**, such as table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
+## Installation

-🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments.
+Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.1+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+.

-🤗 Transformers is backed by the three most popular deep learning libraries — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.
+Create and activate a virtual environment with [venv](https://docs.python.org/3/library/venv.html) or [uv](https://docs.astral.sh/uv/), a fast Rust-based Python package and project manager.

-## Online demos
-
-You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) for public and private models.
-
-Here are a few examples:
-
-In Natural Language Processing:
- [Masked word completion with BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
- [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
- [Text generation with Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
- [Natural Language Inference with RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
- [Question answering with DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
- [Translation with T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
-
-In Computer Vision:
- [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224)
- [Object Detection with DETR](https://huggingface.co/facebook/detr-resnet-50)
- [Semantic Segmentation with SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
- [Panoptic Segmentation with Mask2Former](https://huggingface.co/facebook/mask2former-swin-large-coco-panoptic)
- [Depth Estimation with Depth Anything](https://huggingface.co/docs/transformers/main/model_doc/depth_anything)
- [Video Classification with VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)
- [Universal Segmentation with OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
-
-In Audio:
- [Automatic Speech Recognition with Whisper](https://huggingface.co/openai/whisper-large-v3)
- [Keyword Spotting with Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
- [Audio Classification with Audio Spectrogram Transformer](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
-
-In Multimodal tasks:
- [Table Question Answering with TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
- [Visual Question Answering with ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
- [Image captioning with LLaVa](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
- [Zero-shot Image Classification with SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384)
- [Document Question Answering with LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
- [Zero-shot Video Classification with X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)
- [Zero-shot Object Detection with OWLv2](https://huggingface.co/docs/transformers/en/model_doc/owlv2)
- [Zero-shot Image Segmentation with CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)
- [Automatic Mask Generation with SAM](https://huggingface.co/docs/transformers/model_doc/sam)
-
-
-## 100 projects using Transformers
-
-Transformers is more than a toolkit to use pretrained models: it's a community of projects built around it and the
-Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone
-else to build their dream projects.
-
-In order to celebrate the 100,000 stars of transformers, we have decided to put the spotlight on the
-community, and we have created the [awesome-transformers](./awesome-transformers.md) page which lists 100
-incredible projects built in the vicinity of transformers.
-
-If you own or use a project that you believe should be part of the list, please open a PR to add it!
-
-## Serious about AI in your organisation? Build faster with the Hugging Face Enterprise Hub.
-
-<a target="_blank" href="https://huggingface.co/enterprise">
-    <img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925">
-</a><br>
-
-## Quick tour
-
-To immediately use a model on a given input (text, image, audio, ...), we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts:
-
-```python
->>> from transformers import pipeline
-
-# Allocate a pipeline for sentiment-analysis
->>> classifier = pipeline('sentiment-analysis')
->>> classifier('We are very happy to introduce pipeline to the transformers repository.')
-[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```py
+# venv
+python -m venv .my-env
+source .my-env/bin/activate
+# uv
+uv venv .my-env
+source .my-env/bin/activate
 ```

-The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here, the answer is "positive" with a confidence of 99.97%.
+Install Transformers in your virtual environment.

-Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in computer vision and speech. For example, we can easily extract detected objects in an image:
+```py
+# pip
+pip install "transformers[torch]"

-``` python
->>> import requests
->>> from PIL import Image
->>> from transformers import pipeline
-
-# Download an image with cute cats
->>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
->>> image_data = requests.get(url, stream=True).raw
->>> image = Image.open(image_data)
-
-# Allocate a pipeline for object detection
->>> object_detector = pipeline('object-detection')
->>> object_detector(image)
-[{'score': 0.9982201457023621,
-  'label': 'remote',
-  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
- {'score': 0.9960021376609802,
-  'label': 'remote',
-  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
- {'score': 0.9954745173454285,
-  'label': 'couch',
-  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
- {'score': 0.9988006353378296,
-  'label': 'cat',
-  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
- {'score': 0.9986783862113953,
-  'label': 'cat',
-  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
+# uv
+uv pip install "transformers[torch]"
 ```

-Here, we get a list of objects detected in the image, with a box surrounding the object and a confidence score. Here is the original image on the left, with the predictions displayed on the right:
+Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error.
+
+```shell
+git clone https://github.com/huggingface/transformers.git
+cd transformers
+
+# pip
+pip install .[torch]
+
+# uv
+uv pip install .[torch]
+```
+
+## Quickstart
+
+Get started with Transformers right away with the [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial) API. The `Pipeline` is a high-level inference class that supports text, audio, vision, and multimodal tasks. It handles preprocessing the input and returns the appropriate output.
+
+Instantiate a pipeline and specify model to use for text generation. The model is downloaded and cached so you can easily reuse it again. Finally, pass some text to prompt the model.
+
+```py
+from transformers import pipeline
+
+pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B")
+pipeline("the secret to baking a really good cake is ")
+[{'generated_text': 'the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? To make 2 cakes, you will need 2 cups of sugar.'}]
+```
+
+To chat with a model, the usage pattern is the same. The only difference is you need to construct a chat history (the input to `Pipeline`) between you and the system.
+
+> [!TIP]
+> You can also chat with a model directly from the command line.
+> ```shell
+> transformers chat Qwen/Qwen2.5-0.5B-Instruct
+> ```
+
+```py
+import torch
+from transformers import pipeline
+
+chat = [
+    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
+    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
+]
+
+pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
+response = pipeline(chat, max_new_tokens=512)
+print(response[0]["generated_text"][-1]["content"])
+```
+
+Expand the examples below to see how `Pipeline` works for different modalities and tasks.
+
+<details>
+<summary>Automatic speech recognition</summary>
+
+```py
+from transformers import pipeline
+
+pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3")
+pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
+```
+
+</details>
+
+<details>
+<summary>Image classification</summary>

 <h3 align="center">
-    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
-    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
+    <a><img src="https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"></a>
 </h3>

-You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary).
+```py
+from transformers import pipeline

-In addition to `pipeline`, to download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version:
-```python
->>> from transformers import AutoTokenizer, AutoModel
-
->>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
->>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
-
->>> inputs = tokenizer("Hello world!", return_tensors="pt")
->>> outputs = model(**inputs)
+pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer")
+pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+[{'label': 'macaw', 'score': 0.997848391532898},
+ {'label': 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita',
+  'score': 0.0016551691805943847},
+ {'label': 'lorikeet', 'score': 0.00018523589824326336},
+ {'label': 'African grey, African gray, Psittacus erithacus',
+  'score': 7.85409429227002e-05},
+ {'label': 'quail', 'score': 5.502637941390276e-05}]
 ```

-And here is the equivalent code for TensorFlow:
-```python
->>> from transformers import AutoTokenizer, TFAutoModel
+</details>

->>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
->>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")
+<details>
+<summary>Visual question answering</summary>

->>> inputs = tokenizer("Hello world!", return_tensors="tf")
->>> outputs = model(**inputs)
+
+<h3 align="center">
+    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg"></a>
+</h3>
+
+```py
+from transformers import pipeline
+
+pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base")
+pipeline(
+    image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg",
+    question="What is in the image?",
+)
+[{'answer': 'statue of liberty'}]
 ```

-The tokenizer is responsible for all the preprocessing the pretrained model expects and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or simply directly pass to your model using the ** argument unpacking operator.
+</details>

-The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset.
-
-## Why should I use transformers?
+## Why should I use Transformers?

 1. Easy-to-use state-of-the-art models:
-    - High performance on natural language understanding & generation, computer vision, and audio tasks.
-    - Low barrier to entry for educators and practitioners.
+    - High performance on natural language understanding & generation, computer vision, audio, video, and multimodal tasks.
+    - Low barrier to entry for researchers, engineers, and developers.
    - Few user-facing abstractions with just three classes to learn.
    - A unified API for using all our pretrained models.

 1. Lower compute costs, smaller carbon footprint:
-    - Researchers can share trained models instead of always retraining.
-    - Practitioners can reduce compute time and production costs.
-    - Dozens of architectures with over 400,000 pretrained models across all modalities.
+    - Share trained models instead of training from scratch.
+    - Reduce compute time and production costs.
+    - Dozens of model architectures with 1M+ pretrained checkpoints across all modalities.

-1. Choose the right framework for every part of a model's lifetime:
+1. Choose the right framework for every part of a models lifetime:
    - Train state-of-the-art models in 3 lines of code.
-    - Move a single model between TF2.0/PyTorch/JAX frameworks at will.
-    - Seamlessly pick the right framework for training, evaluation, and production.
+    - Move a single model between PyTorch/JAX/TF2.0 frameworks at will.
+    - Pick the right framework for training, evaluation, and production.

 1. Easily customize a model or an example to your needs:
    - We provide examples for each architecture to reproduce the results published by its original authors.
    - Model internals are exposed as consistently as possible.
    - Model files can be used independently of the library for quick experiments.

-## Why shouldn't I use transformers?
+<a target="_blank" href="https://huggingface.co/enterprise">
+    <img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925">
+</a><br>
+
+## Why shouldn't I use Transformers?

 - This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
- The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly, [Accelerate](https://huggingface.co/docs/accelerate)).
- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.
+- The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like [Accelerate](https://huggingface.co/docs/accelerate).
+- The [example scripts]((https://github.com/huggingface/transformers/tree/main/examples)) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.

-## Installation
+## 100 projects using Transformers

-### With pip
+Transformers is more than a toolkit to use pretrained models, it's a community of projects built around it and the
+Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone
+else to build their dream projects.

-This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 2.0+, and TensorFlow 2.6+.
+In order to celebrate Transformers 100,000 stars, we wanted to put the spotlight on the
+community with the [awesome-transformers](./awesome-transformers.md) page which lists 100
+incredible projects built with Transformers.

-You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
+If you own or use a project that you believe should be part of the list, please open a PR to add it!

-First, create a virtual environment with the version of Python you're going to use and activate it.
+## Example models

-**macOS/Linux**
+You can test most of our models directly on their [Hub model pages](https://huggingface.co/models).

-```python -m venv env
-source env/bin/activate
-```
+Expand each modality below to see a few example models for various use cases.

-**Windows**
+<details>
+<summary>Audio</summary>

-``` python -m venv env
-env\Scripts\activate
-```
+- Audio classification with [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo)
+- Automatic speech recognition with [Moonshine](https://huggingface.co/UsefulSensors/moonshine)
+- Keyword spotting with [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
+- Speech to speech generation with [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16)
+- Text to audio with [MusicGen](https://huggingface.co/facebook/musicgen-large)
+- Text to speech with [Bark](https://huggingface.co/suno/bark)

-To use 🤗 Transformers, you must install at least one of Flax, PyTorch, or TensorFlow. Refer to the official installation guides for platform-specific commands:
+</details>

-[TensorFlow installation page](https://www.tensorflow.org/install/), 
-[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) 
+<details>
+<summary>Computer vision</summary>

-When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:
+- Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base)
+- Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf)
+- Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base)
+- Keypoint detection with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
+- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue)
+- Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd)
+- Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple)
+- Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large)
+- Video classification with [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large)

-```
-pip install transformers
-```
+</details>

-If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source).
+<details>
+<summary>Multimodal</summary>

-```
-git clone https://github.com/huggingface/transformers.git
-cd transformers
-pip install .
-```
+- Audio or text to text with [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B)
+- Document question answering with [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base)
+- Image or text to text with [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
+- Image captioning [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b)
+- OCR-based document understanding with [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf)
+- Table question answering with [TAPAS](https://huggingface.co/google/tapas-base)
+- Unified multimodal understanding and generation with [Emu3](https://huggingface.co/BAAI/Emu3-Gen)
+- Vision to text with [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)
+- Visual question answering with [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
+- Visual referring expression segmentation with [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224)

-### With conda
+</details>

-🤗 Transformers can be installed using conda as follows:
+<details>
+<summary>NLP</summary>

-```shell script
-conda install conda-forge::transformers
-```
+- Masked word completion with [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base)
+- Named entity recognition with [Gemma](https://huggingface.co/google/gemma-2-2b)
+- Question answering with [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
+- Summarization with [BART](https://huggingface.co/facebook/bart-large-cnn)
+- Translation with [T5](https://huggingface.co/google-t5/t5-base)
+- Text generation with [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B)
+- Text classification with [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B)

-> **_NOTE:_** Installing `transformers` from the `huggingface` channel is deprecated.
-
-Follow the installation pages of Flax, PyTorch or TensorFlow to see how to install them with conda.
-
-> **_NOTE:_**  On Windows, you may be prompted to activate Developer Mode in order to benefit from caching. If this is not an option for you, please let us know in [this issue](https://github.com/huggingface/huggingface_hub/issues/1062).
-
-## Model architectures
-
-**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
-
-Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
-
-🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each them.
-
-To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
-
-These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations. You can find more details on performance in the Examples section of the [documentation](https://github.com/huggingface/transformers/tree/main/examples).
-
-
-## Learn more
-
-| Section | Description |
-|-|-|
-| [Documentation](https://huggingface.co/docs/transformers/) | Full API documentation and tutorials |
-| [Task summary](https://huggingface.co/docs/transformers/task_summary) | Tasks supported by 🤗 Transformers |
-| [Preprocessing tutorial](https://huggingface.co/docs/transformers/preprocessing) | Using the `Tokenizer` class to prepare data for the models |
-| [Training and fine-tuning](https://huggingface.co/docs/transformers/training) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API |
-| [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/main/examples) | Example scripts for fine-tuning models on a wide range of tasks |
-| [Model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing) | Upload and share your fine-tuned models with the community |
+</details>

 ## Citation

--- a/SECURITY.md
+++ b/SECURITY.md
@ -27,13 +27,6 @@ These models require the `trust_remote_code=True` parameter to be set when using
 the content of the modeling files when using this argument. We recommend setting a revision in order to ensure you
 protect yourself from updates on the repository.

-#### Tools
-
-Through the `Agent` framework, remote tools can be downloaded to be used by the Agent. You're to specify these tools
-yourself, but please keep in mind that their code will be run on your machine if the Agent chooses to run them.
-
-Please inspect the code of the tools before passing them to the Agent to protect your runtime and local setup.
-
 ## Reporting a Vulnerability

 Feel free to submit vulnerability reports to [security@huggingface.co](mailto:security@huggingface.co), where someone from the HF security team will review and recommend next steps. If reporting a vulnerability specific to open source, please note [Huntr](https://huntr.com) is a vulnerability disclosure program for open source software.
--- a/awesome-transformers.md
+++ b/awesome-transformers.md
@ -29,7 +29,7 @@ Keywords: inpainting, SD, Stable Diffusion

 ## [flair](https://github.com/flairNLP/flair)

-FLAIR is a powerful PyTorch NLP framework, convering several important tasks: NER, sentiment-analysis, part-of-speech tagging, text and document embeddings, among other things.
+FLAIR is a powerful PyTorch NLP framework, covering several important tasks: NER, sentiment-analysis, part-of-speech tagging, text and document embeddings, among other things.

 Keywords: NLP, text embedding, document embedding, biomedical, NER, PoS, sentiment-analysis

@ -47,7 +47,7 @@ Keywords: LLMs, Large Language Models, Agents, Chains

 ## [LlamaIndex](https://github.com/run-llama/llama_index)

-[LlamaIndex](https://github.com/run-llama/llama_index) is a project that provides a central interface to connect your LLM's with external data. It provides various kinds of indices and retreival mechanisms to perform different LLM tasks and obtain knowledge-augmented results.
+[LlamaIndex](https://github.com/run-llama/llama_index) is a project that provides a central interface to connect your LLM's with external data. It provides various kinds of indices and retrieval mechanisms to perform different LLM tasks and obtain knowledge-augmented results.

 Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation 

--- a/benchmark/README.md
+++ b/benchmark/README.md
@ -12,7 +12,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

 ## Writing metrics to the database

-`MetricRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.
+`MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.

 cf [`llama.py`](./llama.py) to see an example of this in practice.

--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@ -90,7 +90,7 @@ def summarize(run_dir, metrics, expand_metrics=False):

        model = benchmark.config.backend["model"]

-        # Ths looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
+        # This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
        # (we rely on the usage of hydra's `${hydra.job.override_dirname}`.)
        benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
        benchmark_name = str(Path(benchmark_name).parts[-1])
--- a/benchmark/benchmarks_entrypoint.py
+++ b/benchmark/benchmarks_entrypoint.py
@ -3,7 +3,6 @@ import importlib.util
 import logging
 import os
 from typing import Dict
-import psycopg2
 import sys

 from psycopg2.extras import Json
@ -136,7 +135,7 @@ if __name__ == "__main__":
                continue
            logger.debug(f"loading: {entry.name}")
            module = import_from_path(entry.name.split(".")[0], entry.path)
-            logger.info(f"runnning benchmarks in: {entry.name}")
+            logger.info(f"running benchmarks in: {entry.name}")
            module.run_benchmark(logger, branch, commit_id, commit_msg)
        except ImportModuleException as e:
            logger.error(e)
--- a/benchmark/llama.py
+++ b/benchmark/llama.py
@ -118,7 +118,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
        with torch.no_grad():
            past_key_values = StaticCache(
                model.config,
-                batch_size=batch_size,
+                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + num_tokens_to_generate,
@ -144,7 +144,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

            past_key_values = StaticCache(
                model.config,
-                batch_size=batch_size,
+                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + num_tokens_to_generate,
@ -187,7 +187,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
            # TODO use  decode_one_token(model, input_id.clone(), cache_position) for verification
            past_key_values = StaticCache(
                model.config,
-                batch_size=batch_size,
+                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + num_tokens_to_generate + 10,
@ -204,7 +204,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
            time_to_first_token = end - start
            logger.info(f"completed first compile generation in: {time_to_first_token}s")
            cache_position += 1
-            all_generated_tokens += next_token.clone().detach().cpu().tolist()
+            all_generated_tokens += next_token.tolist()

            cache_position = torch.tensor([seq_length], device=device)
            ### First compile, decoding
@ -215,9 +215,9 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
            torch.cuda.synchronize()
            end = perf_counter()
            time_to_second_token = end - start
-            logger.info(f"completed second compile generation in: {time_to_first_token}s")
+            logger.info(f"completed second compile generation in: {time_to_second_token}s")
            cache_position += 1
-            all_generated_tokens += next_token.clone().detach().cpu().tolist()
+            all_generated_tokens += next_token.tolist()

            ### Second compile, decoding
            start = perf_counter()
@ -227,15 +227,15 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
            torch.cuda.synchronize()
            end = perf_counter()
            time_to_third_token = end - start
-            logger.info(f"completed third compile forward in: {time_to_first_token}s")
+            logger.info(f"completed third compile forward in: {time_to_third_token}s")
            cache_position += 1
-            all_generated_tokens += next_token.clone().detach().cpu().tolist()
+            all_generated_tokens += next_token.tolist()

            ### Using cuda graphs decoding

            start = perf_counter()
            for _ in range(1, num_tokens_to_generate):
-                all_generated_tokens += next_token.clone().detach().cpu().tolist()
+                all_generated_tokens += next_token.tolist()
                next_token = decode_one_token(
                    model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
                )
@ -254,7 +254,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

            past_key_values = StaticCache(
                model.config,
-                batch_size=batch_size,
+                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + 128,
@ -271,7 +271,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

            past_key_values = StaticCache(
                model.config,
-                batch_size=batch_size,
+                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + 128,
@ -287,23 +287,23 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,

            past_key_values = StaticCache(
                model.config,
-                batch_size=batch_size,
+                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + 128,
            )

-            # 3nd call
+            # 3rd call
            start = perf_counter()
            output = model.generate(**inputs, past_key_values=past_key_values)
            end = perf_counter()
            third_compile_generate_time = end - start
-            logger.info(f"completed second compile generation in: {third_compile_generate_time}s")
+            logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

            past_key_values = StaticCache(
                model.config,
-                batch_size=batch_size,
+                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + 128,
@ -313,7 +313,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
            output = model.generate(**inputs, past_key_values=past_key_values)
            end = perf_counter()
            fourth_compile_generate_time = end - start
-            logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s")
+            logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

        metrics_recorder.collect_model_measurements(
--- a/conftest.py
+++ b/conftest.py
@ -46,10 +46,6 @@ NOT_DEVICE_TESTS = {
    "test_keep_in_fp32_modules",
    "test_gradient_checkpointing_backward_compatibility",
    "test_gradient_checkpointing_enable_disable",
-    "test_save_load_fast_init_from_base",
-    "test_fast_init_context_manager",
-    "test_fast_init_tied_embeddings",
-    "test_save_load_fast_init_to_base",
    "test_torch_save_load",
    "test_initialization",
    "test_forward_signature",
@ -70,7 +66,6 @@ NOT_DEVICE_TESTS = {
    "ModelTester::test_pipeline_",
    "/repo_utils/",
    "/utils/",
-    "/agents/",
 }

 # allow having multiple repository checkouts and not needing to remember to rerun
@ -87,7 +82,6 @@ def pytest_configure(config):
    config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
    config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
    config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
-    config.addinivalue_line("markers", "agent_tests: mark the agent tests that are run on their specific schedule")
    config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")


--- a/docker/README.md
+++ b/docker/README.md
@ -2,8 +2,8 @@

 In this folder you will find various docker files, and some subfolders. 
 - dockerfiles (ex: `consistency.dockerfile`) present under `~/docker` are used for our "fast" CIs. You should be able to use them for tasks that only need CPU. For example `torch-light` is a very light weights container (703MiB). 
- subfloder contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs)
+- subfolders contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs)

 Note that in both case, you need to run `uv pip install -e .`, which should take around 5 seconds. We do it outside the dockerfile for the need of our CI: we checkout a new branch each time, and the `transformers` code is thus updated. 

-We are open to contribution, and invite the community to create dockerfiles with potential arguments that properly choose extras depending on the model's dependencies! :hugs: 
+We are open to contribution, and invite the community to create dockerfiles with potential arguments that properly choose extras depending on the model's dependencies! :hugs: 
--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -5,12 +5,12 @@ ARG REF=main
 RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
-RUN pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 # tensorflow pin matching setup.py
 RUN uv pip install --no-cache-dir pypi-kenlm
 RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
 RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,testing,torch-speech,vision]"
 RUN git lfs install

-RUN pip uninstall -y transformers
-RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
+RUN uv pip uninstall transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/custom-tokenizers.dockerfile
+++ b/docker/custom-tokenizers.dockerfile
@ -1,5 +1,6 @@
 FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
 USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler
 ENV UV_PYTHON=/usr/local/bin/python
@ -15,12 +16,12 @@ RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
 RUN make install -j 10


-RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
-RUN uv pip install  --no-cache-dir "transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
+RUN uv pip install --no-cache --upgrade 'torch==2.6.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used so not tested. Causes to failures. TODO fix later
 RUN python3 -m unidic download
-RUN pip uninstall -y transformers
+RUN uv pip uninstall transformers

 RUN apt-get clean && rm -rf /var/lib/apt/lists/*
-RUN apt remove -y g++ cmake  xz-utils libprotobuf-dev protobuf-compiler
+RUN apt remove -y g++ cmake  xz-utils libprotobuf-dev protobuf-compiler
--- a/docker/examples-tf.dockerfile
+++ b/docker/examples-tf.dockerfile
@ -1,12 +1,13 @@
 FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
 USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git
 RUN apt-get install -y g++ cmake
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv
 RUN uv pip install --no-cache-dir -U pip setuptools albumentations seqeval
-RUN pip install  --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
-RUN uv pip install --no-cache-dir  "protobuf==3.20.3" 
-RUN pip uninstall -y transformers
-RUN apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN uv pip install  --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir  "protobuf==3.20.3"
+RUN uv pip uninstall transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -1,11 +1,12 @@
 FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
 USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
-RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
-RUN pip uninstall -y transformers
-RUN apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
+RUN uv pip uninstall transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/exotic-models.dockerfile
+++ b/docker/exotic-models.dockerfile
@ -5,13 +5,13 @@ USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps timm accelerate
 RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
 # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
-RUN pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose'  'dataset'
+RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose'  'dataset'
 # RUN git clone https://github.com/facebookresearch/detectron2.git
 # RUN python3 -m pip install --no-cache-dir -e detectron2
-RUN pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3'
-RUN pip uninstall -y transformers
+RUN uv pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3' --no-build-isolation
+RUN uv pip uninstall transformers
 RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/jax-light.dockerfile
+++ b/docker/jax-light.dockerfile
@ -5,6 +5,6 @@ USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]"
-RUN pip uninstall -y transformers
-RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
+RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]"
+RUN uv pip uninstall transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/pipeline-tf.dockerfile
+++ b/docker/pipeline-tf.dockerfile
@ -5,6 +5,6 @@ USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]"
 RUN uv pip install --no-cache-dir  "protobuf==3.20.3" tensorflow_probability
-RUN apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
+RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
-RUN pip uninstall -y transformers
+RUN uv pip uninstall transformers
--- a/docker/quality.dockerfile
+++ b/docker/quality.dockerfile
@ -6,4 +6,4 @@ RUN apt-get update && apt-get install -y time git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip install uv &&  uv venv
 RUN uv pip install --no-cache-dir -U pip setuptools GitPython "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ruff]" urllib3
-RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/tf-light.dockerfile
+++ b/docker/tf-light.dockerfile
@ -6,7 +6,7 @@ RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-de
 RUN apt-get install -y  cmake
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN pip install  --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install  --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
 RUN uv pip install --no-cache-dir  "protobuf==3.20.3"
-RUN pip uninstall -y transformers
+RUN uv pip uninstall transformers
 RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/torch-jax-light.dockerfile
+++ b/docker/torch-jax-light.dockerfile
@ -6,11 +6,11 @@ RUN apt-get update &&  apt-get install -y libsndfile1-dev espeak-ng time git g++
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 RUN uv pip install --no-deps accelerate
-RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
-RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]"
+RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]"


 # RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]"

-RUN pip uninstall -y transformers
+RUN uv pip uninstall transformers
 RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words]"
-RUN pip uninstall -y transformers
+RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
+RUN uv pip uninstall transformers
--- a/docker/torch-tf-light.dockerfile
+++ b/docker/torch-tf-light.dockerfile
@ -7,13 +7,13 @@ RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-de
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
-RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
 RUN git lfs install

 RUN uv pip install --no-cache-dir pypi-kenlm
-RUN pip install --no-cache-dir  "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]"
+RUN uv pip install --no-cache-dir  "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]"
 RUN uv pip install --no-cache-dir  "protobuf==3.20.3" librosa


-RUN pip uninstall -y transformers
-RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
+RUN uv pip uninstall transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -14,6 +14,8 @@ ARG PYTORCH='2.6.0'
 ARG INTEL_TORCH_EXT='2.3.0'
 # Example: `cu102`, `cu113`, etc.
 ARG CUDA='cu121'
+# Disable kernel mapping for now until all tests pass
+ENV DISABLE_KERNEL_MAPPING=1

 RUN apt update
 RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
@ -57,7 +59,8 @@ RUN python3 -m pip uninstall -y ninja

 # For `dinat` model
 # The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
-RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels
+# pin `0.17.4` otherwise `cannot import name 'natten2dav' from 'natten.functional'`
+RUN python3 -m pip install --no-cache-dir natten==0.17.4+torch250cu121 -f https://shi-labs.com/natten/wheels

 # For `nougat` tokenizer
 RUN python3 -m pip install --no-cache-dir python-Levenshtein
@ -68,6 +71,9 @@ RUN python3 -m pip install --no-cache-dir g2p-en
 # For Some bitsandbytes tests
 RUN python3 -m pip install --no-cache-dir einops

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -1,12 +1,12 @@
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
-FROM nvcr.io/nvidia/pytorch:23.11-py3
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
+FROM nvcr.io/nvidia/pytorch:24.08-py3
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

-ARG PYTORCH='2.2.0'
+ARG PYTORCH='2.6.0'
 # Example: `cu102`, `cu113`, etc.
-ARG CUDA='cu121'
+ARG CUDA='cu126'

 RUN apt -y update
 RUN apt install -y libaio-dev
@ -15,7 +15,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
 ARG REF=main
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

-RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
+# `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
+RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'

 # Install latest release PyTorch
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
@ -44,6 +45,9 @@ RUN python3 -m pip uninstall -y deepspeed
 # TODO: Find out why test fail.
 RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
@ -1,11 +1,11 @@
 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
-FROM nvcr.io/nvidia/pytorch:23.11-py3
+FROM nvcr.io/nvidia/pytorch:24.08-py3
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

 # Example: `cu102`, `cu113`, etc.
-ARG CUDA='cu121'
+ARG CUDA='cu126'

 RUN apt -y update
 RUN apt install -y libaio-dev
@ -21,7 +21,8 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
 RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA

-RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
+# `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
+RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

@ -56,6 +57,9 @@ RUN python3 -m pip uninstall -y deepspeed
 #RUN git clone https://github.com/pytorch/TensorRT.git
 #RUN cd TensorRT/py && python3 setup.py install --fx-only

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@ -28,6 +28,9 @@ RUN python3 -m pip uninstall -y tensorflow flax
 RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive
@ -9,9 +9,11 @@ SHELL ["sh", "-lc"]
 # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
 # to be used as arguments for docker build (so far).

-ARG PYTORCH='2.5.1'
+ARG PYTORCH='2.6.0'
 # Example: `cu102`, `cu113`, etc.
-ARG CUDA='cu118'
+ARG CUDA='cu121'
+# Disable kernel mapping for quantization tests
+ENV DISABLE_KERNEL_MAPPING=1

 RUN apt update
 RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
@ -26,8 +28,6 @@ RUN echo torch=$VERSION
 # Currently, let's just use their latest releases (when `torch` is installed with a release version)
 RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA

-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
-
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

 # needed in bnb and awq
@ -36,10 +36,9 @@ RUN python3 -m pip install --no-cache-dir einops
 # Add bitsandbytes for mixed int8 testing
 RUN python3 -m pip install --no-cache-dir bitsandbytes

-# Add auto-gptq for gtpq quantization testing, installed from source for pytorch==2.5.1 compatibility
-# TORCH_CUDA_ARCH_LIST="7.5+PTX" is added to make the package compile for Tesla T4 gpus available for the CI.
-RUN pip install gekko
-RUN git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ && TORCH_CUDA_ARCH_LIST="7.5+PTX" python3 setup.py install
+# Add gptqmodel for gtpq quantization testing, installed from source for pytorch==2.6.0 compatibility
+RUN python3 -m pip install lm_eval
+RUN git clone https://github.com/ModelCloud/GPTQModel.git && cd GPTQModel && pip install -v . --no-build-isolation

 # Add optimum for gptq quantization testing
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
@ -51,10 +50,11 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef
 RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2

 # Add vptq for quantization testing
-RUN python3 -m pip install --no-cache-dir vptq
+RUN pip install vptq

 # Add spqr for quantization testing
-RUN python3 -m pip install --no-cache-dir spqr_quant[gpu]
+# Commented for now as No matching distribution found we need to reach out to the authors
+# RUN python3 -m pip install --no-cache-dir spqr_quant[gpu]

 # Add hqq for quantization testing
 RUN python3 -m pip install --no-cache-dir hqq
@ -63,22 +63,36 @@ RUN python3 -m pip install --no-cache-dir hqq
 RUN python3 -m pip install --no-cache-dir gguf

 # Add autoawq for quantization testing
-# >=v0.2.7 needed for compatibility with transformers > 4.46
-RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.7.post2/autoawq-0.2.7.post2-py3-none-any.whl
+# New release v0.2.8
+RUN python3 -m pip install --no-cache-dir autoawq[kernels]

 # Add quanto for quantization testing
 RUN python3 -m pip install --no-cache-dir optimum-quanto

 # Add eetq for quantization testing
-RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git
+RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submodule update --init --recursive && pip install .

-# Add flute-kernel and fast_hadamard_transform for quantization testing
-RUN python3 -m pip install --no-cache-dir flute-kernel==0.3.0 -i https://flute-ai.github.io/whl/cu118
-RUN python3 -m pip install --no-cache-dir fast_hadamard_transform==1.0.4.post1
+# # Add flute-kernel and fast_hadamard_transform for quantization testing
+# # Commented for now as they cause issues with the build
+# # TODO: create a new workflow to test them
+# RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
+# RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git

 # Add compressed-tensors for quantization testing
 RUN python3 -m pip install --no-cache-dir compressed-tensors

+# Add AMD Quark for quantization testing
+RUN python3 -m pip install --no-cache-dir amd-quark
+
+# Add AutoRound for quantization testing
+RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
+
+# Add transformers in editable mode
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
+
+# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
+RUN python3 -m pip uninstall -y kernels
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docs/source/ar/_toctree.yml
+++ b/docs/source/ar/_toctree.yml
@ -23,8 +23,6 @@
    title: تحميل النماذج المخصصة وتدريبها باستخدام 🤗 PEFT
  - local: model_sharing
    title: مشاركة نموذجك
-  - local: agents
-    title: الوكلاء
  - local: llm_tutorial
    title: التوليد باستخدام LLMs
  - local: conversations
@ -252,8 +250,6 @@
  title: أطر مفاهيمية
 # - sections:
 #   - sections:
-#     - local: main_classes/agent
-#       title: الوكلاء والأدوات
 #     - local: model_doc/auto
 #       title: فئات يتم إنشاؤها ديناميكيًا
 #     - local: main_classes/backbones
--- a/docs/source/ar/agents.md
+++ b/docs/source/ar/agents.md
@ -1,539 +0,0 @@
-# الوكلاء والأدوات
-
-[[open-in-colab]]
-
-### ما هو الوكيل؟
-
-يمكن للنظم اللغوية الكبيرة (LLMs) التي تم تدريبها على أداء [نمذجة اللغة السببية](./tasks/language_modeling.) التعامل مع مجموعة واسعة من المهام، ولكنها غالبًا ما تواجه صعوبات في المهام الأساسية مثل المنطق والحساب والبحث. وعندما يتم استدعاؤها في مجالات لا تؤدي فيها أداءً جيدًا، فإنها غالبًا ما تفشل في توليد الإجابة التي نتوقعها منها.
-
-يتمثل أحد النهج للتغلب على هذا القصور في إنشاء "وكيل".
-
-الوكيل هو نظام يستخدم LLM كمحرك له، ولديه حق الوصول إلى وظائف تسمى "أدوات".
-
-هذه "الأدوات" هي وظائف لأداء مهمة، وتحتوي على جميع الأوصاف اللازمة للوكيل لاستخدامها بشكل صحيح.
-
-يمكن برمجة الوكيل للقيام بما يلي:
- وضع سلسلة من الإجراءات/الأدوات وتشغيلها جميعًا في نفس الوقت مثل [`CodeAgent`] على سبيل المثال
- التخطيط للاجراءات/الأدوات وتنفيذها واحدة تلو الأخرى والانتظار حتى انتهاء كل إجراء قبل إطلاق التالي مثل [`ReactJsonAgent`] على سبيل المثال
-
-### أنواع الوكلاء
-
-#### الوكيل البرمجي (Code agent)
-
-يتمتع هذا الوكيل يتبع خطوات محددة: أولًا، يخطط لسلسلة من الإجراءات التي يريد تنفيذها، ثم شفرة Python لتنفيذ جميع الإجراءات في نفس الوقت. وهو يتعامل بشكل أصلي مع أنواع مختلفة من المدخلات والمخرجات للأدوات التي يستخدمها، وبالتالي فهو الخيار الموصى به للمهام متعددة الوسائط.
-
-#### وكلاء التفاعل
-
-هذا هو الوكيل الذي يتم اللجوء إليه لحل مهام الاستدلال، حيث يجعل إطار ReAct ([Yao et al.، 2022](https://huggingface.co/papers/2210.03629)) من الكفاءة حقًا التفكير على أساس ملاحظاته السابقة.
-
-نقوم بتنفيذ إصدارين من ReactJsonAgent: 
- [`ReactJsonAgent`] يقوم بتوليد استدعاءات الأدوات كـ JSON في إخراجها.
- [`ReactCodeAgent`] هو نوع جديد من ReactJsonAgent يقوم بتوليد استدعاءات أدواته كمقاطع من التعليمات البرمجية، والتي تعمل بشكل جيد حقًا مع LLMs التي تتمتع بأداء  قوي في البرمجة.
-
-> [!TIP]
-> اقرأ منشور المدونة [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) لمعرفة المزيد عن وكيل ReAct.
-
-![إطار عمل وكيل ReAct](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png)
-
-على سبيل المثال، إليك كيف يعمل وكيل ReAct Code طريقه من خلال السؤال التالي.
-
-```py3
->>> agent.run(
-...     "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
-... )
-=====New task=====
-How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
-====Agent is executing the code below:
-bert_blocks = search(query="number of blocks in BERT base encoder")
-print("BERT blocks:", bert_blocks)
-====
-Print outputs:
-BERT blocks: twelve encoder blocks
-
-====Agent is executing the code below:
-attention_layer = search(query="number of layers in Attention is All You Need")
-print("Attention layers:", attention_layer)
-====
-Print outputs:
-Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture.
-
-====Agent is executing the code below:
-bert_blocks = 12
-attention_layers = 6
-diff = bert_blocks - attention_layers
-print("Difference in blocks:", diff)
-final_answer(diff)
-====
-
-Print outputs:
-Difference in blocks: 6
-
-Final answer: 6
-```
-
-### كيف يمكنني بناء وكيل؟
-
-لتهيئة وكيل، تحتاج إلى هذه الوسائط:
-
- نموذج لغوي كبير (LLM) يشكل المحرك الأساسي للوكيل. الوكيل نفسه ليس النموذج اللغوي، بل هو برنامج يستخدم النموذج اللغوي كمحرك له.
- موجه النظام (system prompt): هذه هي التعليمات التي يتم إعطاؤها للنموذج اللغوي لإنشاء مخرجاته.
- صندوق أدوات (toolbox) يختار الوكيل منه الأدوات لتنفيذها
- محلل (parser) لاستخراج الأدوات التي يجب استدعاؤها من مخرجات النموذج اللغوي LLM والأدوات التي يجب استخدامها
-
-عند تهيئة نظام الوكيل، يتم استخدام سمات الأداة لإنشاء وصف للأداة، ثم يتم دمجها في موجه النظام الخاص `system_prompt` للوكيل لإعلامه بالأدوات التي يمكنه استخدامها ولماذا.
-
-للبدء، يرجى تثبيت `agents` الإضافية لتثبيت جميع التبعيات الافتراضية.
-
-```bash
-pip install transformers[agents]
-```
-
-قم ببناء محرك LLM الخاص بك من خلال تعريف طريقة `llm_engine` التي تقبل قائمة من [الرسائل](./chat_templating.) وتعيد النص. يجب أن تقبل هذه الدالة القابلة للاستدعاء أيضًا معامل `stop` يشير إلى متى يجب التوقف عن التوليد.
-
-```python
-from huggingface_hub import login, InferenceClient
-
-login("<YOUR_HUGGINGFACEHUB_API_TOKEN>")
-
-client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
-
-def llm_engine(messages, stop_sequences=["Task"]) -> str:
-    response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
-    answer = response.choices[0].message.content
-    return answer
-```
-
-يمكنك استخدام أي طريقة `llm_engine` طالما أنها:
-1. يتبع تنسيق [رسائل](./chat_templating.md) لإدخاله (`List [Dict [str، str]]`) ويعيد `str`
-2. يتوقف عن توليد المخراجات من التسلسلات التي تم تمريرها في معامل `stop`
-
-أنت بحاجة أيضًا إلى معامل "الأدوات" الذي يقبل قائمة من "الأدوات". يمكنك توفير قائمة فارغة لـ "الأدوات"، ولكن استخدم صندوق الأدوات الافتراضي مع معامل اختياري `add_base_tools=True`.
-
-الآن يمكنك إنشاء وكيل، مثل [`CodeAgent`], وتشغيله. ولتسهيل الأمر، نقدم أيضًا فئة [`HfEngine`] التي تستخدم `huggingface_hub.InferenceClient` بشكل مخفى.
-
-```python
-from transformers import CodeAgent, HfEngine
-
-llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
-agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-
-agent.run(
-    "Could you translate this sentence from French, say it out loud and return the audio.",
-    sentence="Où est la boulangerie la plus proche?",
-)
-```
-
-هذه الميزة ستكون مفيدة في حالة الحاجة الملحة! يمكنك حتى ترك معامل `llm_engine` غير محدد، وسيتم إنشاء [`HfEngine`] بشكل تلقائي.
-
-```python
-from transformers import CodeAgent
-
-agent = CodeAgent(tools=[], add_base_tools=True)
-
-agent.run(
-    "Could you translate this sentence from French, say it out loud and give me the audio.",
-    sentence="Où est la boulangerie la plus proche?",
-)
-```
-
-لاحظ أننا استخدمنا معامل "sentence" إضافي: يمكنك تمرير النص كمعامل إضافي إلى النموذج.
-
-يمكنك أيضًا استخدام هذا للإشارة إلى مسار الملفات المحلية أو البعيدة للنموذج لاستخدامها:
-
-```py
-from transformers import ReactCodeAgent
-
-agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-
-agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
-```
-
-
-تم تحديد موجه النظام ومحلل المخرجات تلقائيًا، ولكن يمكنك فحصهما بسهولة عن طريق استدعاء `system_prompt_template` على وكيلك.
-
-```python
-print(agent.system_prompt_template)
-```
-
-من المهم أن تشرح بأكبر قدر ممكن من الوضوح المهمة التي تريد تنفيذها.
-كل عملية [`~Agent.run`] مستقلة، وبما أن الوكيل مدعوم من LLM، فقد تؤدي الاختلافات الطفيفة في موجهك إلى نتائج مختلفة تمامًا.
-يمكنك أيضًا تشغيل وكيل بشكل متتالي لمهام مختلفة: في كل مرة يتم فيها إعادة تهيئة سمتي `agent.task` و`agent.logs`.
-
-
-#### تنفيذ التعليمات البرمجية
-
-يقوم مفسر Python بتنفيذ التعليمات البرمجية على مجموعة من المدخلات التي يتم تمريرها جنبًا إلى جنب مع أدواتك.
-يجب أن يكون هذا الأمر آمنًا لأن الوظائف الوحيدة التي يمكن استدعاؤها هي الأدوات التي قدمتها (خاصة إذا كانت أدوات من Hugging Face فقط) ووظيفة الطباعة، لذا فأنت مقيد بالفعل بما يمكن تنفيذه.
-
-مفسر Python لا يسمح أيضًا باستدعاء دوال بشكل افتراضي خارج قائمة آمنة، لذا فإن جميع الهجمات الأكثر وضوحًا لا ينبغي أن تكون مشكلة.
-يمكنك أيضًا الإذن باستيرادات إضافية عن طريق تمرير الوحدات النمطية المصرح بها كقائمة من السلاسل في معامل  `additional_authorized_imports` عند تهيئة [`ReactCodeAgent`] أو [`CodeAgent`]:
-
-```py
->>> from transformers import ReactCodeAgent
-
->>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
->>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
-
-(...)
-'Hugging Face – Blog'
-```
-
-سيتم إيقاف التنفيذ عند أي رمز يحاول تنفيذ عملية غير قانونية أو إذا كان هناك خطأ Python عادي في التعليمات البرمجية التي تم إنشاؤها بواسطة الوكيل.
-
-> [!WARNING]
-> يمكن لـ LLM توليد شفرة برمجية عشوائية سيتم تنفيذها بعد ذلك: لا تقمب استدعاء أى دوال غير آمنة!
-
-### موجه النظام
-
-ينشئ الوكيل، أو بالأحرى LLM الذي يقود الوكيل، يولد مخرجات بناءً على موجه النظام. يمكن تخصيص موجه النظام وتصميمه للمهام المقصودة. على سبيل المثال، تحقق من موجه النظام لـ [`ReactCodeAgent`] (الإصدار أدناه مبسط قليلاً).
-
-```text
-You will be given a task to solve as best you can.
-You have access to the following tools:
-<<tool_descriptions>>
-
-To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
-
-At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
-Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence.
-During each intermediate step, you can use 'print()' to save whatever important information you will then need.
-These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
-
-In the end you have to return a final answer using the `final_answer` tool.
-
-Here are a few examples using notional tools:
---
-{examples}
-
-Above example were using notional tools that might not exist for you. You only have acces to those tools:
-<<tool_names>>
-You also can perform computations in the python code you generate.
-
-Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward.
-
-Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
-Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
-
-Remember to make sure that variables you use are all defined.
-
-Now Begin!
-```
-
-يتضمن موجه النظام:
- *مقدمة* تشرح كيف يجب أن يتصرف الوكيل والأدوات التي يجب عليه استخدامها.
- وصف لجميع الأدوات التي يتم تحديدها بواسطة رمز `<<tool_descriptions>>` الذي يتم استبداله ديناميكيًا في وقت التشغيل بالأدوات التي يحددها المستخدم أو يختارها.
-    - يأتي وصف الأداة من سمات الأداة، `name`، و`description`، و`inputs` و`output_type`، وقالب `jinja2` بسيط يمكنك تحسينه.
- شكل المخرج المتوقع.
-
-يمكنك تحسين موجه النظام، على سبيل المثال، عن طريق إضافة شرح لتنسيق المخرجات.
-
-للحصول على أقصى قدر من المرونة، يمكنك الكتابة فوق قالب موجه النظام بالكامل عن طريق تمرير موجه مخصص كمعامل إلى معلمة `system_prompt`.
-
-```python
-from transformers import ReactJsonAgent
-from transformers.agents import PythonInterpreterTool
-
-agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
-```
-
-> [!WARNING]
-> يرجى التأكد من تحديد سلسلة `<<tool_descriptions>>` في مكان ما في `template` حتى يكون الوكيل على علم 
-بالأدوات المتاحة.
-
-
-### فحص تشغيل الوكيل
-
-فيما يلي بعض السمات المفيدة لفحص ما حدث بعد التشغيل:
- تخزن  `agent.logs` سجلات مفصلة للوكيل. في كل خطوة من تشغيل الوكيل، يتم تخزين كل شيء في قاموس إلحاقه بـ `agent.logs`.
- تشغيل `agent.write_inner_memory_from_logs()` يخلق ذاكرة داخلية لسجلات الوكيل للنظام LLM لعرضها، كقائمة من رسائل الدردشة. تنتقل هذه الطريقة عبر كل خطوة من سجل الوكيل ولا تخزن سوى ما يهمها كرسالة: على سبيل المثال، سيحفظ موجه النظام والمهمة في رسائل منفصلة، ثم لكل خطوة سيخزن مخرج LLM كرسالة، ومخرج استدعاء الأداة كرسالة أخرى. استخدم هذا إذا كنت تريد عرضًا عامًا لما حدث - ولكن لن يتم نسخ كل سجل بواسطة هذه الطريقة.
-
-## الأدوات
-
-الأداة هي عبارة عن وظيفة أساسية يستخدمها الوكيل لتنفيذ مهمة محددة.
-
-يمكنك على سبيل المثال التحقق من [`PythonInterpreterTool`]: لديه اسم ووصف ووصف للمدخلات ونوع للمخرج، وطريقة `__call__` التي تقوم بتنفيذ المهمة المطلوبة.
-
-عند تهيئة الوكيل، يتم استخدام سمات الأداة لتوليد وصف للأداة يتم تضمينه في موجه النظام الخاص بالوكيل. يتيح هذا للوكيل معرفة الأدوات التي يمكنه استخدامها ولماذا.
-
-### صندوق الأدوات الافتراضي
-
-يأتي Transformers مع صندوق أدوات افتراضي لتمكين الوكلاء، والذي يمكنك إضافته إلى وكيلك عند التهيئة باستخدام معامل `add_base_tools = True`:
-
- **الإجابة على أسئلة المستند**: الإجابة على سؤال حول المستند (مثل ملف PDF) بتنسيق صورة ([Donut](./model_doc/donut))
- **الإجابة على أسئلة الصور**: الإجابة على سؤال حول صورة ([VILT](./model_doc/vilt))
- **التحدث إلى النص**: قم بتفريغ الكلام إلى نص ([Whisper](./model_doc/whisper))
- **النص إلى كلام**: تحويل النص إلى كلام ([SpeechT5](./model_doc/speecht5))
- **الترجمة**: ترجمة جملة معينة من لغة المصدر إلى لغة الهدف.
- **مفسر كود Python**: تشغيل كود Python الذي تم إنشاؤه بواسطة LLM في بيئة آمنة. لن يتم إضافة هذه الأداة إلى [`ReactJsonAgent`] إلا إذا استخدمت `add_base_tools=True`، نظرًا لأن الأدوات المستندة إلى التعليمات البرمجية يمكنها بالفعل تنفيذ كود Python
-لا تترجم النصوص الخاصة ولا الأكواد البرمجية ولا الروابط ولا رموز HTML وCSS:
-
-يمكنك استخدام أداة يدويًا عن طريق استدعاء دالة [`load_tool`] وتحديد مهمة لتنفيذها.
-
-```python
-from transformers import load_tool
-
-tool = load_tool("text-to-speech")
-audio = tool("This is a text to speech tool")
-```
-
-### إنشاء أداة جديدة
-
-يمكنك إنشاء أداتك الخاصة لتغطية حالات الاستخدام التي لا تغطيها الأدوات الافتراضية من Hugging Face.
-على سبيل المثال، دعنا نقوم بإنشاء أداة تعرض النموذج الأكثر تنزيلًا لمهمة معينة من Hub.
-
-سوف نبدأ بالكود التالي.
-
-```python
-from huggingface_hub import list_models
-
-task = "text-classification"
-
-model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-print(model.id)
-```
-
-يمكن تحويل هذه الشيفرة إلى فئة ترث من الفئة العليا [`Tool`].
-
-تحتاج الأداة المخصصة إلى:
-
- اسم `name`، والتي تمثل اسم الأداة نفسها. عادةً ما يصف الاسم وظيفتها. بما أن الكود يعيد النموذج الأكثر تنزيلًا لمهمة ما، فلنسمها `model_download_counter`.
- تستخدم خاصية `description` لملء موجه نظام الوكيل.
- خاصية `inputs`، والتي هي عبارة عن قاموس بمفاتيح "type" و"description". يحتوي على معلومات تساعد المفسر Python على اتخاذ خيارات مستنيرة بشأن المدخلات.
- خاصية `output_type`، والتي تحدد نوع المخرج.
- طريقة `forward` والتي تحتوي على الكود الذي سيتم تنفيذه للحصول على النتيجة النهائية.
-
-```python
-from transformers import Tool
-from huggingface_hub import list_models
-
-class HFModelDownloadsTool(Tool):
-    name = "model_download_counter"
-    description = (
-        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
-        "It returns the name of the checkpoint."
-    )
-
-    inputs = {
-        "task": {
-            "type": "text",
-            "description": "the task category (such as text-classification, depth-estimation, etc)",
-        }
-    }
-    output_type = "text"
-
-    def forward(self, task: str):
-        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-        return model.id
-```
-
-الآن بعد أن أصبحت فئة `HfModelDownloadsTool` المخصصة جاهزة، يمكنك حفظها في ملف باسم `model_downloads.py` واستيرادها للاستخدام.
-
-```python
-from model_downloads import HFModelDownloadsTool
-
-tool = HFModelDownloadsTool()
-```
-
-يمكنك أيضًا مشاركة أداتك المخصصة في Hub عن طريق استدعاء [`~Tool.push_to_hub`] على الأداة. تأكد من أنك قمت بإنشاء مستودع لها على Hub وأنك تستخدم رمز وصول للقراءة.
-
-```python
-tool.push_to_hub("{your_username}/hf-model-downloads")
-```
-
-قم بتحميل الأداة باستخدام دالة [`~Tool.load_tool`] ومررها إلى معلمة `tools` في الوكيل الخاص بك.
-
-```python
-from transformers import load_tool, CodeAgent
-
-model_download_tool = load_tool("m-ric/hf-model-downloads")
-agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
-agent.run(
-    "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
-)
-```
-
-ستحصل على ما يلي:
-
-```text
-======== New task ========
-Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
-==== Agent is executing the code below:
-most_downloaded_model = model_download_counter(task="text-to-video")
-print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
-====
-```
-
-والناتج:
-
-`"النموذج الأكثر تنزيلًا لمهمة `text-to-video` هو ByteDance/AnimateDiff-Lightning."`
-
-### إدارة صندوق أدوات الوكيل الخاص بك
-
-إذا كنت قد قمت بتهيئة وكيل، فمن غير الملائم إعادة تهيئته من البداية لإضافة أداة جديدة ترغب في استخدامها. باستخدام مكتبة Transformers، يمكنك إدارة صندوق أدوات الوكيل بإضافة أو استبدال أداة موجودة.
-
-دعنا نضيف الأداة `model_download_tool` إلى وكيل تم تهيئته مسبقًا باستخدام صندوق الأدوات الافتراضي.
-
-```python
-from transformers import CodeAgent
-
-agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-agent.toolbox.add_tool(model_download_tool)
-```
-
-الآن يمكننا الاستفادة من الأداة الجديدة وأداة تحويل النص إلى كلام السابقة:
-
-```python
-    agent.run(
-        "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?"
-    )
-```
-
-| **Audio**                                                                                                                                            |
-|------------------------------------------------------------------------------------------------------------------------------------------------------|
-| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
-
-> [!WARNING]
-> احترس عند إضافة أدوات إلى وكيل يعمل بالفعل لأنه يمكن أن يؤثر على اختيار الأداة لصالح أداتك أو اختيار أداة أخرى غير المحددة بالفعل.
-
-استخدم طريقة `agent.toolbox.update_tool()` لاستبدال أداة موجودة في صندوق أدوات الوكيل.
-هذا مفيد إذا كانت أداتك الجديدة بديلاً مباشرًا للأداة الموجودة لأن الوكيل يعرف بالفعل كيفية تنفيذ تلك المهمة المحددة.
-تأكد فقط من اتباع الأداة الجديدة لنفس واجهة برمجة التطبيقات (API) للأداة المستبدلة أو قم بتكييف قالب موجه النظام لضمان تحديث جميع الأمثلة التي تستخدم الأداة المستبدلة.
-
-### استخدام مجموعة من الأدوات
-
-يمكنك الاستفادة من مجموعات الأدوات باستخدام كائن ToolCollection، مع تحديد مجموعة الأدوات التي تريد استخدامها.
-ثم قم بتمريرها كقائمة لتهيئة الوكيل الخاص بك، وبدء استخدامها!
-
-```py
-from transformers import ToolCollection, ReactCodeAgent
-
-image_tool_collection = ToolCollection(collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f")
-agent = ReactCodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True)
-
-agent.run("Please draw me a picture of rivers and lakes.")
-```
-
-لتسريع البداية، يتم تحميل الأدوات فقط إذا استدعاها الوكيل.
-
-ستحصل على هذه الصورة:
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" />
-
-### استخدام gradio-tools
-
-[gradio-tools](https://github.com/freddyaboulton/gradio-tools) هي مكتبة قوية تتيح استخدام Hugging
-Face Spaces كأدوات. تدعم العديد من المساحات الموجودة بالإضافة إلى مساحات مخصصة.
-
-تدعم مكتبة Transformers `gradio_tools` باستخدام طريقة [`Tool.from_gradio`] في الفئة. على سبيل المثال، دعنا نستخدم [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) من مجموعة أدوات `gradio-tools` لتحسين المطالبات لإنشاء صور أفضل.
-
-استورد وقم بتهيئة الأداة، ثم مررها إلى طريقة `Tool.from_gradio`:
-
-```python
-from gradio_tools import StableDiffusionPromptGeneratorTool
-from transformers import Tool, load_tool, CodeAgent
-
-gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
-prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
-```
-
-الآن يمكنك استخدامه مثل أي أداة أخرى. على سبيل المثال، دعنا نحسن الموجه `a rabbit wearing a space suit`.
-
-```python
-image_generation_tool = load_tool('huggingface-tools/text-to-image')
-agent = CodeAgent(tools=[prompt_generator_tool, image_generation_tool], llm_engine=llm_engine)
-
-agent.run(
-    "Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit'
-)
-```
-
-يستفيد النموذج بشكل كافٍ من الأداة:
-
-```text
-======== New task ========
-Improve this prompt, then generate an image of it.
-You have been provided with these initial arguments: {'prompt': 'A rabbit wearing a space suit'}.
-==== Agent is executing the code below:
-improved_prompt = StableDiffusionPromptGenerator(query=prompt)
-while improved_prompt == "QUEUE_FULL":
-    improved_prompt = StableDiffusionPromptGenerator(query=prompt)
-print(f"The improved prompt is {improved_prompt}.")
-image = image_generator(prompt=improved_prompt)
-====
-```
-
-قبل إنشاء الصورة أخيرًا:
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit_spacesuit_flux.webp" />
-
-> [!WARNING]
-> تتطلب gradio-tools إدخالات وإخراجات *نصية* حتى عند العمل مع طرائق مختلفة مثل كائنات الصور والصوت. الإدخالات والإخراجات الصورية والصوتية غير متوافقة حاليًا.
-
-### استخدام أدوات LangChain
-
-نحن نحب Langchain ونعتقد أنها تحتوي على مجموعة أدوات قوية للغاية.
-لاستيراد أداة من LangChain، استخدم الطريقة `from_langchain()`.
-
-فيما يلي كيفية استخدامها لإعادة إنشاء نتيجة البحث في المقدمة باستخدام أداة بحث الويب LangChain.
-
-```python
-from langchain.agents import load_tools
-from transformers import Tool, ReactCodeAgent
-
-search_tool = Tool.from_langchain(load_tools(["serpapi"])[0])
-
-agent = ReactCodeAgent(tools=[search_tool])
-
-agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?")
-```
-
-## واجهة Gradio
-
-يمكنك الاستفادة من `gradio.Chatbot` لعرض أفكار الوكيل الخاص بك باستخدام `stream_to_gradio`، إليك مثال:
-
-```py
-import gradio as gr
-from transformers import (
-    load_tool,
-    ReactCodeAgent,
-    HfEngine,
-    stream_to_gradio,
-)
-
-# Import tool from Hub
-image_generation_tool = load_tool("m-ric/text-to-image")
-
-llm_engine = HfEngine("meta-llama/Meta-Llama-3-70B-Instruct")
-
-# Initialize the agent with the image generation tool
-agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
-
-
-def interact_with_agent(task):
-    messages = []
-    messages.append(gr.ChatMessage(role="user", content=task))
-    yield messages
-    for msg in stream_to_gradio(agent, task):
-        messages.append(msg)
-        yield messages + [
-            gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
-        ]
-    yield messages
-
-
-with gr.Blocks() as demo:
-    text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
-    submit = gr.Button("Run illustrator agent!")
-    chatbot = gr.Chatbot(
-        label="Agent",
-        type="messages",
-        avatar_images=(
-            None,
-            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
-        ),
-    )
-    submit.click(interact_with_agent, [text_input], [chatbot])
-
-if __name__ == "__main__":
-    demo.launch()
-```
--- a/docs/source/ar/bertology.md
+++ b/docs/source/ar/bertology.md
@ -15,4 +15,4 @@
 - الوصول إلى جميع أوزان الانتباه لكل رأس في BERT/GPT/GPT-2،
 - استرجاع قيم ومشتقات  مخرجات الرأس لحساب درجة أهمية الرأس وحذفه كما هو موضح في https://arxiv.org/abs/1905.10650.

-ولمساعدتك على فهم واستخدام هذه الميزات بسهولة، أضفنا مثالًا برمجيًا محددًا: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) أثناء استخراج المعلومات  وتقليص من نموذج تم تدريبه مسبقًا على GLUE.
+ولمساعدتك على فهم واستخدام هذه الميزات بسهولة، أضفنا مثالًا برمجيًا محددًا: [bertology.py](https://github.com/huggingface/transformers-research-projects/tree/main/bertology/run_bertology.py) أثناء استخراج المعلومات  وتقليص من نموذج تم تدريبه مسبقًا على GLUE.
--- a/docs/source/ar/gguf.md
+++ b/docs/source/ar/gguf.md
@ -77,7 +77,7 @@ model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)

 الآن لديك إمكانية الوصول إلى النسخة الكامل غير المكممة للنموذج في بيئة PyTorch، حيث يمكنك دمجه مع مجموعة كبيرة من الأدوات الأخرى.

-لإعادة التحويل إلى ملف `gguf`، نوصي باستخدام ملف [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) من llama.cpp.
+لإعادة التحويل إلى ملف `gguf`، نوصي باستخدام ملف [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py) من llama.cpp.

 فيما يلي كيفية إكمال البرنامج النصي أعلاه لحفظ النموذج وإعادة تصديره مرة أخرى إلى `gguf`:

--- a/docs/source/ar/run_scripts.md
+++ b/docs/source/ar/run_scripts.md
@ -2,7 +2,7 @@

 بالإضافة إلى دفاتر الملاحظات [notebooks](./notebooks) الخاصة بـ 🤗 Transformers، هناك أيضًا نصوص برمجية توضيحية تُظهر كيفية تدريب نموذج لمهمة باستخدام [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch) أو [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) أو [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax).

-كما ستجد النصوص البرمجية التي استخدمناها في [مشاريع الأبحاث](https://github.com/huggingface/transformers/tree/main/examples/research_projects) و [الأمثلة القديمة](https://github.com/huggingface/transformers/tree/main/examples/legacy) والتي ساهم بها المجتمع بشكل أساسي. هذه النصوص البرمجية غير مدعومة بشكل نشط وقد تتطلب إصدارًا محددًا من مكتبة 🤗 Transformers والذي من المحتمل أن يكون غير متوافق مع الإصدار الأحدث من المكتبة.
+كما ستجد النصوص البرمجية التي استخدمناها في [مشاريع الأبحاث](https://github.com/huggingface/transformers-research-projects/) و [الأمثلة القديمة](https://github.com/huggingface/transformers/tree/main/examples/legacy) والتي ساهم بها المجتمع بشكل أساسي. هذه النصوص البرمجية غير مدعومة بشكل نشط وقد تتطلب إصدارًا محددًا من مكتبة 🤗 Transformers والذي من المحتمل أن يكون غير متوافق مع الإصدار الأحدث من المكتبة.

 لا يُتوقع أن تعمل النصوص البرمجية التوضيحية بشكل مباشر على كل مشكلة، وقد تحتاج إلى تكييف النص البرمجي مع المشكلة التي تحاول حلها. ولمساعدتك في ذلك، تعرض معظم النصوص البرمجية كيفية معالجة البيانات قبل التدريب بشكل كامل، مما يتيح لك تحريرها حسب الحاجة لحالتك الاستخدام.

--- a/docs/source/ar/serialization.md
+++ b/docs/source/ar/serialization.md
@ -116,11 +116,11 @@ optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_s

 <Tip warning={true}>

-لم يعد يتم دعم `tranformers.onnx`  يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة.
+لم يعد يتم دعم `transformers.onnx`  يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة.

 </Tip>

-لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `tranformers.onnx`، ثبّت التبعيات الإضافية:
+لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `transformers.onnx`، ثبّت التبعيات الإضافية:

 ```bash
 pip install transformers[onnx]
--- a/docs/source/ar/trainer.md
+++ b/docs/source/ar/trainer.md
@ -674,29 +674,7 @@ use_cpu: false
 ```

 </hfoption>
-<hfoption id="Tensor Parallelism with PyTorch 2">

-```yml
-compute_environment: LOCAL_MACHINE
-tp_config:
-  tp_size: 4
-distributed_type: TP
-downcast_bf16: 'no'
-machine_rank: 0
-main_training_function: main
-mixed_precision: 'no'
-num_machines: 1
-num_processes: 4
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
-
-```
-
-</hfoption>
 </hfoptions>
 يُعد أمر  [`accelerate_launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) هو الطريقة المُوصى بها لتشغيل نص البرمجى للتدريب على نظام موزع باستخدام Accelerate و [`Trainer`] مع المعلمات المحددة في `config_file.yaml`. يتم حفظ هذا الملف في مجلد ذاكرة التخزين المؤقت لـ Accelerate ويتم تحميله تلقائيًا عند تشغيل `accelerate_launch`.

--- a/docs/source/de/_toctree.yml
+++ b/docs/source/de/_toctree.yml
@ -23,8 +23,6 @@
    title: Laden und Trainieren von Adaptern mit 🤗 PEFT
  - local: model_sharing
    title: Ein Modell teilen
-  - local: transformers_agents
-    title: Agents
  - local: llm_tutorial
    title: Generation with LLMs
  title: Tutorials
@ -39,4 +37,4 @@
    title: Testen
  - local: pr_checks
    title: Überprüfung einer Pull Request
-  title: Contribute
+  title: Contribute
--- a/docs/source/de/add_new_model.md
+++ b/docs/source/de/add_new_model.md
@ -95,7 +95,7 @@ wie der Code geschrieben werden sollte :-)
 1. Der Vorwärtsdurchlauf Ihres Modells sollte vollständig in die Modellierungsdatei geschrieben werden und dabei völlig unabhängig von anderen
   Modellen in der Bibliothek. Wenn Sie einen Block aus einem anderen Modell wiederverwenden möchten, kopieren Sie den Code und fügen ihn mit einem
   `# Kopiert von` ein (siehe [hier](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
-   für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from). 
+   für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from).
 2. Der Code sollte vollständig verständlich sein, auch für einen Nicht-Muttersprachler. Das heißt, Sie sollten
   beschreibende Variablennamen wählen und Abkürzungen vermeiden. Ein Beispiel: `activation` ist `act` vorzuziehen.
   Von Variablennamen mit nur einem Buchstaben wird dringend abgeraten, es sei denn, es handelt sich um einen Index in einer for-Schleife.
@ -402,7 +402,7 @@ Andernfalls beginnen wir mit der Erstellung eines neuen Modells. Wir empfehlen d
 ein bestehendes Modell:

 ```bash
-transformers-cli add-new-model-like
+transformers add-new-model-like
 ```

 Sie werden mit einem Fragebogen aufgefordert, die grundlegenden Informationen Ihres Modells einzugeben.
--- a/docs/source/de/contributing.md
+++ b/docs/source/de/contributing.md
@ -63,7 +63,7 @@ Wenn Sie sich vergewissert haben, dass der Fehler noch nicht gemeldet wurde, geb
 Um das Betriebssystem und die Softwareversionen automatisch auszugeben, führen Sie den folgenden Befehl aus:

 ```bash
-transformers-cli env
+transformers env
 ```

 Sie können denselben Befehl auch im Hauptverzeichnis des Repositorys ausführen:
--- a/docs/source/de/index.md
+++ b/docs/source/de/index.md
@ -88,7 +88,7 @@ Die Bibliothek enthält derzeit JAX-, PyTorch- und TensorFlow-Implementierungen,
 1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
 1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
 1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
+1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers-research-projects/tree/main/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers-research-projects/tree/main/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers-research-projects/tree/main/distillation) and a German version of DistilBERT.
 1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
 1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
--- a/docs/source/de/quicktour.md
+++ b/docs/source/de/quicktour.md
@ -156,7 +156,7 @@ Die [`pipeline`] kann jedes Modell aus dem [Model Hub](https://huggingface.co/mo

 <frameworkcontent>
 <pt>
-Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` below):
+Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `AutoClass` below):

 ```py
 >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
@ -166,7 +166,7 @@ Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the
 ```
 </pt>
 <tf>
-Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` below):
+Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `TFAutoClass` below):

 ```py
 >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
@ -222,7 +222,7 @@ Anschließend wandelt der Tokenizer die Token in Zahlen um, um einen Tensor als
 Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält:

 * [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token.
-* [atttention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen.
+* [attention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen.

 Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben:

--- a/docs/source/de/run_scripts.md
+++ b/docs/source/de/run_scripts.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 Neben den 🤗 Transformers [notebooks](./notebooks) gibt es auch Beispielskripte, die zeigen, wie man ein Modell für eine Aufgabe mit [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) oder [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax) trainiert.

-Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers/tree/main/examples/research_projects) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die größtenteils von der Community stammen. Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von 🤗 Transformers, die höchstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist.
+Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers-research-projects/) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die größtenteils von der Community stammen. Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von 🤗 Transformers, die höchstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist.

 Es wird nicht erwartet, dass die Beispielskripte bei jedem Problem sofort funktionieren. Möglicherweise müssen Sie das Skript an das Problem anpassen, das Sie zu lösen versuchen. Um Ihnen dabei zu helfen, legen die meisten Skripte vollständig offen, wie die Daten vorverarbeitet werden, so dass Sie sie nach Bedarf für Ihren Anwendungsfall bearbeiten können.

--- a/docs/source/de/transformers_agents.md
+++ b/docs/source/de/transformers_agents.md
@ -1,323 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Transformers Agents
-
-<Tip warning={true}>
-
-Transformers Agents ist eine experimentelle API, die jederzeit geändert werden kann. Die von den Agenten zurückgegebenen Ergebnisse
-zurückgegeben werden, können variieren, da sich die APIs oder die zugrunde liegenden Modelle ändern können.
-
-</Tip>
-
-Transformers Version v4.29.0, die auf dem Konzept von *Tools* und *Agenten* aufbaut. Sie können damit spielen in
-[dieses Colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj).
-
-Kurz gesagt, es bietet eine API für natürliche Sprache auf der Grundlage von Transformers: Wir definieren eine Reihe von kuratierten Tools und entwerfen einen 
-Agenten, um natürliche Sprache zu interpretieren und diese Werkzeuge zu verwenden. Es ist von vornherein erweiterbar; wir haben einige relevante Tools kuratiert, 
-aber wir werden Ihnen zeigen, wie das System einfach erweitert werden kann, um jedes von der Community entwickelte Tool zu verwenden.
-
-Beginnen wir mit einigen Beispielen dafür, was mit dieser neuen API erreicht werden kann. Sie ist besonders leistungsfähig, wenn es um 
-Sie ist besonders leistungsstark, wenn es um multimodale Aufgaben geht. Lassen Sie uns also eine Runde drehen, um Bilder zu erzeugen und Text vorzulesen.
-
-```py
-agent.run("Caption the following image", image=image)
-```
-
-| **Input**                                                                                                                   | **Output**                        |
-|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------|
-| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/beaver.png" width=200> | A beaver is swimming in the water |
-
---
-
-```py
-agent.run("Read the following text out loud", text=text)
-```
-| **Input**                                                                                                               | **Output**                                   |
-|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------|
-| A beaver is swimming in the water | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tts_example.wav" type="audio/wav"> your browser does not support the audio element. </audio>
-
---
-
-```py
-agent.run(
-    "In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?",
-    document=document,
-)
-```
-| **Input**                                                                                                                   | **Output**     |
-|-----------------------------------------------------------------------------------------------------------------------------|----------------|
-| <img src="https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/0/image/image.jpg" width=200> | ballroom foyer |
-
-## Schnellstart
-
-Bevor Sie `agent.run` verwenden können, müssen Sie einen Agenten instanziieren, der ein großes Sprachmodell (LLM) ist. 
-Wir bieten Unterstützung für openAI-Modelle sowie für OpenSource-Alternativen von BigCode und OpenAssistant. Die openAI
-Modelle sind leistungsfähiger (erfordern aber einen openAI-API-Schlüssel, können also nicht kostenlos verwendet werden); Hugging Face
-bietet kostenlosen Zugang zu Endpunkten für BigCode- und OpenAssistant-Modelle.
-
-To start with, please install the `agents` extras in order to install all default dependencies.
-```bash
-pip install transformers[agents]
-```
-
-Um openAI-Modelle zu verwenden, instanziieren Sie einen [`OpenAiAgent`], nachdem Sie die `openai`-Abhängigkeit installiert haben:
-
-```bash
-pip install openai
-```
-
-
-```py
-from transformers import OpenAiAgent
-
-agent = OpenAiAgent(model="text-davinci-003", api_key="<your_api_key>")
-```
-
-Um BigCode oder OpenAssistant zu verwenden, melden Sie sich zunächst an, um Zugriff auf die Inference API zu erhalten:
-
-```py
-from huggingface_hub import login
-
-login("<YOUR_TOKEN>")
-```
-
-Dann instanziieren Sie den Agenten
-
-```py
-from transformers import HfAgent
-
-# Starcoder
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-# StarcoderBase
-# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase")
-# OpenAssistant
-# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5")
-```
-
-Dies geschieht mit der Inferenz-API, die Hugging Face derzeit kostenlos zur Verfügung stellt. Wenn Sie Ihren eigenen Inferenz
-Endpunkt für dieses Modell (oder einen anderen) haben, können Sie die obige URL durch Ihren URL-Endpunkt ersetzen.
-
-<Tip>
-
-StarCoder und OpenAssistant sind kostenlos und leisten bei einfachen Aufgaben bewundernswert gute Arbeit. Allerdings halten die Kontrollpunkte
-nicht, wenn es um komplexere Aufforderungen geht. Wenn Sie mit einem solchen Problem konfrontiert sind, empfehlen wir Ihnen, das OpenAI
-Modell auszuprobieren, das zwar leider nicht quelloffen ist, aber zur Zeit eine bessere Leistung erbringt.
-
-</Tip>
-
-Sie sind jetzt startklar! Lassen Sie uns in die beiden APIs eintauchen, die Ihnen jetzt zur Verfügung stehen.
-
-### Einzelne Ausführung (run)
-
-Die Methode der einmaligen Ausführung ist die Verwendung der [`~Agent.run`] Methode des Agenten:
-
-```py
-agent.run("Draw me a picture of rivers and lakes.")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
-
-Es wählt automatisch das (oder die) Werkzeug(e) aus, das (die) für die von Ihnen gewünschte Aufgabe geeignet ist (sind) und führt es (sie) entsprechend aus. Es
-kann eine oder mehrere Aufgaben in der gleichen Anweisung ausführen (je komplexer Ihre Anweisung ist, desto wahrscheinlicher ist ein
-der Agent scheitern).
-
-```py
-agent.run("Draw me a picture of the sea then transform the picture to add an island")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sea_and_island.png" width=200>
-
-<br/>
-
-
-Jede [`~Agent.run`] Operation ist unabhängig, so dass Sie sie mehrmals hintereinander mit unterschiedlichen Aufgaben ausführen können.
-
-Beachten Sie, dass Ihr `Agent` nur ein großsprachiges Modell ist, so dass kleine Variationen in Ihrer Eingabeaufforderung völlig unterschiedliche Ergebnisse liefern können.
-unterschiedliche Ergebnisse liefern. Es ist wichtig, dass Sie die Aufgabe, die Sie ausführen möchten, so genau wie möglich erklären. Wir gehen noch weiter ins Detail
-wie man gute Prompts schreibt [hier](custom_tools#writing-good-user-inputs).
-
-Wenn Sie einen Status über Ausführungszeiten hinweg beibehalten oder dem Agenten Nicht-Text-Objekte übergeben möchten, können Sie dies tun, indem Sie
-Variablen, die der Agent verwenden soll. Sie könnten zum Beispiel das erste Bild von Flüssen und Seen erzeugen, 
-und das Modell bitten, dieses Bild zu aktualisieren und eine Insel hinzuzufügen, indem Sie Folgendes tun:
-
-```python
-picture = agent.run("Generate a picture of rivers and lakes.")
-updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture)
-```
-
-<Tip>
-
-Dies kann hilfreich sein, wenn das Modell Ihre Anfrage nicht verstehen kann und die Werkzeuge verwechselt. Ein Beispiel wäre:
-
-```py
-agent.run("Draw me the picture of a capybara swimming in the sea")
-```
-
-Hier könnte das Modell auf zwei Arten interpretieren:
- Die Funktion `Text-zu-Bild` erzeugt ein Wasserschwein, das im Meer schwimmt.
- Oder Sie lassen das `Text-zu-Bild` ein Wasserschwein erzeugen und verwenden dann das Werkzeug `Bildtransformation`, um es im Meer schwimmen zu lassen.
-
-Falls Sie das erste Szenario erzwingen möchten, können Sie dies tun, indem Sie die Eingabeaufforderung als Argument übergeben:
-
-```py
-agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea")
-```
-
-</Tip>
-
-
-### Chat-basierte Ausführung (Chat)
-
-Der Agent verfügt auch über einen Chat-basierten Ansatz, der die Methode [`~Agent.chat`] verwendet:
-
-```py
-agent.chat("Generate a picture of rivers and lakes")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200> 
-
-```py
-agent.chat("Transform the picture so that there is a rock in there")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_and_beaver.png" width=200>
-
-<br/>
-
-Dies ist ein interessanter Ansatz, wenn Sie den Zustand über Anweisungen hinweg beibehalten möchten. Er ist besser für Experimente geeignet, 
-eignet sich aber eher für einzelne Anweisungen als für komplexe Anweisungen (die die [`~Agent.run`]
-Methode besser verarbeiten kann).
-
-Diese Methode kann auch Argumente entgegennehmen, wenn Sie Nicht-Text-Typen oder bestimmte Aufforderungen übergeben möchten.
-
-### ⚠️ Fernausführung
-
-Zu Demonstrationszwecken und damit es mit allen Setups verwendet werden kann, haben wir Remote-Executors für mehrere 
-der Standard-Tools erstellt, auf die der Agent in dieser Version Zugriff hat. Diese werden erstellt mit 
-[inference endpoints](https://huggingface.co/inference-endpoints).
-
-Wir haben diese vorerst deaktiviert, aber um zu sehen, wie Sie selbst Remote Executors Tools einrichten können,
-empfehlen wir die Lektüre des [custom tool guide](./custom_tools).
-
-### Was passiert hier? Was sind Tools und was sind Agenten?
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/diagram.png">
-
-#### Agenten
-
-Der "Agent" ist hier ein großes Sprachmodell, das wir auffordern, Zugang zu einem bestimmten Satz von Tools zu erhalten.
-
-LLMs sind ziemlich gut darin, kleine Codeproben zu erzeugen. Diese API macht sich das zunutze, indem sie das 
-LLM ein kleines Codebeispiel gibt, das eine Aufgabe mit einer Reihe von Werkzeugen ausführt. Diese Aufforderung wird dann ergänzt durch die 
-Aufgabe, die Sie Ihrem Agenten geben, und die Beschreibung der Werkzeuge, die Sie ihm geben. Auf diese Weise erhält er Zugriff auf die Dokumentation der 
-Tools, insbesondere die erwarteten Eingaben und Ausgaben, und kann den entsprechenden Code generieren.
-
-#### Tools
-
-Tools sind sehr einfach: Sie bestehen aus einer einzigen Funktion mit einem Namen und einer Beschreibung. Wir verwenden dann die Beschreibungen dieser Tools 
-um den Agenten aufzufordern. Anhand der Eingabeaufforderung zeigen wir dem Agenten, wie er die Tools nutzen kann, um das zu tun, was in der 
-in der Abfrage angefordert wurde.
-
-Dies geschieht mit brandneuen Tools und nicht mit Pipelines, denn der Agent schreibt besseren Code mit sehr atomaren Tools. 
-Pipelines sind stärker refaktorisiert und fassen oft mehrere Aufgaben in einer einzigen zusammen. Tools sind dafür gedacht, sich auf
-eine einzige, sehr einfache Aufgabe konzentrieren.
-
-#### Code-Ausführung?!
-
-Dieser Code wird dann mit unserem kleinen Python-Interpreter auf den mit Ihren Tools übergebenen Eingaben ausgeführt. 
-Wir hören Sie schon schreien "Willkürliche Codeausführung!", aber lassen Sie uns erklären, warum das nicht der Fall ist.
-
-Die einzigen Funktionen, die aufgerufen werden können, sind die von Ihnen zur Verfügung gestellten Tools und die Druckfunktion, so dass Sie bereits eingeschränkt sind 
-eingeschränkt, was ausgeführt werden kann. Sie sollten sicher sein, wenn es sich auf die Werkzeuge für das Umarmungsgesicht beschränkt. 
-
-Dann lassen wir keine Attributsuche oder Importe zu (die ohnehin nicht benötigt werden, um die 
-Inputs/Outputs an eine kleine Gruppe von Funktionen), so dass alle offensichtlichen Angriffe (und Sie müssten den LLM 
-dazu auffordern, sie auszugeben) kein Problem darstellen sollten. Wenn Sie auf Nummer sicher gehen wollen, können Sie die 
-run()-Methode mit dem zusätzlichen Argument return_code=True ausführen. In diesem Fall gibt der Agent nur den auszuführenden Code 
-zur Ausführung zurück und Sie können entscheiden, ob Sie ihn ausführen möchten oder nicht.
-
-Die Ausführung bricht bei jeder Zeile ab, in der versucht wird, eine illegale Operation auszuführen, oder wenn ein regulärer Python-Fehler 
-mit dem vom Agenten generierten Code.
-
-### Ein kuratierter Satz von Tools
-
-Wir haben eine Reihe von Tools identifiziert, die solche Agenten unterstützen können. Hier ist eine aktualisierte Liste der Tools, die wir integriert haben 
-in `transformers` integriert haben:
-
- **Beantwortung von Fragen zu Dokumenten**: Beantworten Sie anhand eines Dokuments (z.B. PDF) im Bildformat eine Frage zu diesem Dokument ([Donut](./model_doc/donut))
- Beantworten von Textfragen**: Geben Sie einen langen Text und eine Frage an, beantworten Sie die Frage im Text ([Flan-T5](./model_doc/flan-t5))
- **Unbedingte Bildunterschriften**: Beschriften Sie das Bild! ([BLIP](./model_doc/blip))
- **Bildfragebeantwortung**: Beantworten Sie bei einem Bild eine Frage zu diesem Bild ([VILT](./model_doc/vilt))
- **Bildsegmentierung**: Geben Sie ein Bild und einen Prompt an und geben Sie die Segmentierungsmaske dieses Prompts aus ([CLIPSeg](./model_doc/clipseg))
- **Sprache in Text**: Geben Sie eine Audioaufnahme einer sprechenden Person an und transkribieren Sie die Sprache in Text ([Whisper](./model_doc/whisper))
- **Text in Sprache**: wandelt Text in Sprache um ([SpeechT5](./model_doc/speecht5))
- **Zero-Shot-Textklassifizierung**: Ermitteln Sie anhand eines Textes und einer Liste von Bezeichnungen, welcher Bezeichnung der Text am ehesten entspricht ([BART](./model_doc/bart))
- **Textzusammenfassung**: fassen Sie einen langen Text in einem oder wenigen Sätzen zusammen ([BART](./model_doc/bart))
- **Übersetzung**: Übersetzen des Textes in eine bestimmte Sprache ([NLLB](./model_doc/nllb))
-
-Diese Tools sind in Transformatoren integriert und können auch manuell verwendet werden, zum Beispiel:
-
-```py
-from transformers import load_tool
-
-tool = load_tool("text-to-speech")
-audio = tool("This is a text to speech tool")
-```
-
-### Benutzerdefinierte Tools
-
-Wir haben zwar eine Reihe von Tools identifiziert, sind aber der festen Überzeugung, dass der Hauptwert dieser Implementierung darin besteht 
-die Möglichkeit, benutzerdefinierte Tools schnell zu erstellen und weiterzugeben.
-
-Indem Sie den Code eines Tools in einen Hugging Face Space oder ein Modell-Repository stellen, können Sie das Tool 
-direkt mit dem Agenten nutzen. Wir haben ein paar neue Funktionen hinzugefügt 
-**transformers-agnostic** Tools zur [`huggingface-tools` Organisation](https://huggingface.co/huggingface-tools) hinzugefügt:
-
- **Text-Downloader**: zum Herunterladen eines Textes von einer Web-URL
- **Text zu Bild**: erzeugt ein Bild nach einer Eingabeaufforderung und nutzt dabei stabile Diffusion
- **Bildtransformation**: verändert ein Bild anhand eines Ausgangsbildes und einer Eingabeaufforderung, unter Ausnutzung der stabilen pix2pix-Diffusion
- **Text zu Video**: Erzeugen eines kleinen Videos nach einer Eingabeaufforderung, unter Verwendung von damo-vilab
-
-Das Text-zu-Bild-Tool, das wir von Anfang an verwendet haben, ist ein Remote-Tool, das sich in 
-[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! Wir werden
-weiterhin solche Tools für diese und andere Organisationen veröffentlichen, um diese Implementierung weiter zu verbessern.
-
-Die Agenten haben standardmäßig Zugriff auf die Tools, die sich auf [*huggingface-tools*](https://huggingface.co/huggingface-tools) befinden.
-Wie Sie Ihre eigenen Tools schreiben und freigeben können und wie Sie jedes benutzerdefinierte Tool, das sich auf dem Hub befindet, nutzen können, erklären wir in [folgender Anleitung](custom_tools).
-
-### Code-Erzeugung
-
-Bisher haben wir gezeigt, wie Sie die Agenten nutzen können, um Aktionen für Sie durchzuführen. Der Agent generiert jedoch nur Code
-den wir dann mit einem sehr eingeschränkten Python-Interpreter ausführen. Falls Sie den generierten Code in einer anderen Umgebung verwenden möchten 
-einer anderen Umgebung verwenden möchten, können Sie den Agenten auffordern, den Code zusammen mit einer Tooldefinition und genauen Importen zurückzugeben.
-
-Zum Beispiel die folgende Anweisung
-```python
-agent.run("Draw me a picture of rivers and lakes", return_code=True)
-```
-
-gibt den folgenden Code zurück
-
-```python
-from transformers import load_tool
-
-image_generator = load_tool("huggingface-tools/text-to-image")
-
-image = image_generator(prompt="rivers and lakes")
-```
-
-die Sie dann selbst ändern und ausführen können.
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -1,294 +1,325 @@
 - sections:
  - local: index
-    title: 🤗 Transformers
-  - local: quicktour
-    title: Quick tour
+    title: Transformers
  - local: installation
    title: Installation
-  - local: add_new_model
-    title: Adding a new model to `transformers`
+  - local: quicktour
+    title: Quickstart
  title: Get started
- sections:
-  - local: pipeline_tutorial
-    title: Run inference with pipelines
-  - local: autoclass_tutorial
-    title: Write portable code with AutoClass
-  - local: preprocessing
-    title: Preprocess data
-  - local: training
-    title: Fine-tune a pretrained model
-  - local: run_scripts
-    title: Train with a script
-  - local: accelerate
-    title: Set up distributed training with 🤗 Accelerate
-  - local: peft
-    title: Load and train adapters with 🤗 PEFT
-  - local: model_sharing
-    title: Share your model
-  - local: agents
-    title: Agents 101
-  - local: agents_advanced
-    title: Agents, supercharged - Multi-agents, External tools, and more
-  - local: llm_tutorial
-    title: Generation with LLMs
-  - local: conversations
-    title: Chatting with Transformers
-  title: Tutorials
- sections:
-  - isExpanded: false
-    sections:
-    - local: tasks/sequence_classification
-      title: Text classification
-    - local: tasks/token_classification
-      title: Token classification
-    - local: tasks/question_answering
-      title: Question answering
-    - local: tasks/language_modeling
-      title: Causal language modeling
-    - local: tasks/masked_language_modeling
-      title: Masked language modeling
-    - local: tasks/translation
-      title: Translation
-    - local: tasks/summarization
-      title: Summarization
-    - local: tasks/multiple_choice
-      title: Multiple choice
-    title: Natural Language Processing
-  - isExpanded: false
-    sections:
-    - local: tasks/audio_classification
-      title: Audio classification
-    - local: tasks/asr
-      title: Automatic speech recognition
-    title: Audio
-  - isExpanded: false
-    sections:
-    - local: tasks/image_classification
-      title: Image classification
-    - local: tasks/semantic_segmentation
-      title: Image segmentation
-    - local: tasks/video_classification
-      title: Video classification
-    - local: tasks/object_detection
-      title: Object detection
-    - local: tasks/zero_shot_object_detection
-      title: Zero-shot object detection
-    - local: tasks/zero_shot_image_classification
-      title: Zero-shot image classification
-    - local: tasks/monocular_depth_estimation
-      title: Depth estimation
-    - local: tasks/image_to_image
-      title: Image-to-Image
-    - local: tasks/image_feature_extraction
-      title: Image Feature Extraction
-    - local: tasks/mask_generation
-      title: Mask Generation
-    - local: tasks/keypoint_detection
-      title: Keypoint Detection
-    - local: tasks/knowledge_distillation_for_image_classification
-      title: Knowledge Distillation for Computer Vision
-    title: Computer Vision
-  - isExpanded: false
-    sections:
-    - local: tasks/image_captioning
-      title: Image captioning
-    - local: tasks/document_question_answering
-      title: Document Question Answering
-    - local: tasks/visual_question_answering
-      title: Visual Question Answering
-    - local: tasks/text-to-speech
-      title: Text to speech
-    - local: tasks/image_text_to_text
-      title: Image-text-to-text
-    - local: tasks/video_text_to_text
-      title: Video-text-to-text
-    title: Multimodal
-  - isExpanded: false
-    sections:
+- isExpanded: false
+  sections:
+  - sections:
+    - local: models
+      title: Loading models
+    - local: custom_models
+      title: Customizing models
+    - local: how_to_hack_models
+      title: Customizing model components
+    - local: model_sharing
+      title: Sharing
+    - local: add_new_model
+      title: Adding a new model to Transformers
+    - local: modular_transformers
+      title: Modular Transformers
+    - local: auto_docstring
+      title: Document your models
+    - local: task_summary
+      title: What 🤗 Transformers can do
+    - local: tasks_explained
+      title: How 🤗 Transformers solve tasks
+    - local: model_summary
+      title: The Transformer model family
+    - local: attention
+      title: Attention mechanisms
+    - local: attention_interface
+      title: Customizing attention function
+    title: Models
+  - sections:
+    - local: fast_tokenizers
+      title: Tokenizers
+    - local: image_processors
+      title: Image processors
+    - local: video_processors
+      title: Video processors
+    - local: backbones
+      title: Backbones
+    - local: feature_extractors
+      title: Feature extractors
+    - local: processors
+      title: Processors
+    - local: tokenizer_summary
+      title: Summary of the tokenizers
+    - local: pad_truncation
+      title: Padding and truncation
+    title: Preprocessors
+  title: Base classes
+- isExpanded: false
+  sections:
+  - sections:
+    - local: pipeline_tutorial
+      title: Pipeline
+    - local: pipeline_gradio
+      title: Machine learning apps
+    - local: pipeline_webserver
+      title: Web server inference
+    - local: add_new_pipeline
+      title: Adding a new pipeline
+    title: Pipeline API
+  - sections:
+    - local: llm_tutorial
+      title: Text generation
    - local: generation_strategies
-      title: Customize the generation strategy
-    - local: kv_cache
-      title: Best Practices for Generation with Cache
-    title: Generation
-  - isExpanded: false
-    sections:
-    - local: chat_template_basics
-      title: Getting Started with Chat Templates for Text LLMs
-    - local: chat_template_multimodal
-      title: Multimodal Chat Templates for Vision and Audio LLMs
-    - local: chat_template_tools_and_documents
-      title: Expanding Chat Templates with Tools and Documents
-    - local: chat_template_advanced
-      title: Advanced Usage and Customizing Your Chat Templates
-    title: Chat Templates
-  - isExpanded: false
-    sections:
-    - local: tasks/idefics
-      title: Image tasks with IDEFICS
+      title: Generation strategies
+    - local: generation_features
+      title: Generation features
    - local: tasks/prompting
-      title: LLM prompting guide
-    title: Prompting
-  title: Task Guides
- sections:
-  - local: fast_tokenizers
-    title: Use fast tokenizers from 🤗 Tokenizers
-  - local: multilingual
-    title: Run inference with multilingual models
-  - local: create_a_model
-    title: Use model-specific APIs
-  - local: custom_models
-    title: Share a custom model
-  - local: trainer
-    title: Trainer
-  - local: sagemaker
-    title: Run training on Amazon SageMaker
+      title: Prompt engineering
+    - local: llm_optims
+      title: Optimizing inference
+    - local: kv_cache
+      title: KV cache strategies
+    - local: serving
+      title: Serving
+    - local: cache_explanation
+      title: Caching
+    - local: llm_tutorial_optimization
+      title: Getting the most out of LLMs
+    - local: perplexity
+      title: Perplexity of fixed-length models
+    title: LLMs
+  - sections:
+    - local: conversations
+      title: Chat basics
+    - local: chat_templating
+      title: Templates
+    - local: chat_templating_multimodal
+      title: Multimodal templates
+    - local: chat_templating_writing
+      title: Template writing
+    - local: chat_extras
+      title: Tools and RAG
+    title: Chat with models
+  - sections:
+    - local: perf_torch_compile
+      title: torch.compile
+    - local: perf_infer_gpu_one
+      title: GPU
+    - local: perf_infer_gpu_multi
+      title: Distributed GPU inference
+    - local: perf_infer_cpu
+      title: CPU
+    - local: tf_xla
+      title: XLA
+    title: Optimization
+  - local: agents
+    title: Agents
+  - local: tools
+    title: Tools
+  title: Inference
+- isExpanded: false
+  sections:
+  - sections:
+    - local: trainer
+      title: Trainer
+    - local: training
+      title: Fine-tuning
+    - local: optimizers
+      title: Optimizers
+    - local: hpo_train
+      title: Hyperparameter search
+    title: Trainer API
+  - sections:
+    - local: gpu_selection
+      title: GPU selection
+    - local: accelerate
+      title: Accelerate
+    - local: fsdp
+      title: FullyShardedDataParallel
+    - local: deepspeed
+      title: DeepSpeed
+    - local: debugging
+      title: Multi-GPU debugging
+    - local: perf_train_cpu_many
+      title: Distributed CPUs
+    - local: perf_train_gpu_many
+      title: Parallelism methods
+    title: Distributed training
+  - sections:
+    - local: perf_train_gpu_one
+      title: GPU
+    - local: perf_train_cpu
+      title: CPU
+    - local: perf_train_tpu_tf
+      title: TPU
+    - local: perf_train_special
+      title: Apple Silicon
+    - local: perf_train_gaudi
+      title: Intel Gaudi
+    - local: perf_hardware
+      title: Build your own machine
+    title: Hardware
+  - local: peft
+    title: PEFT
+  - local: model_memory_anatomy
+    title: Model training anatomy
+  title: Training
+- isExpanded: false
+  sections:
+  - local: quantization/overview
+    title: Overview
+  - local: quantization/selecting
+    title: Selecting a quantization method
+  - local: quantization/concept_guide
+    title: Quantization concepts
+  - local: quantization/aqlm
+    title: AQLM
+  - local: quantization/auto_round
+    title: AutoRound
+  - local: quantization/awq
+    title: AWQ
+  - local: quantization/bitnet
+    title: BitNet
+  - local: quantization/bitsandbytes
+    title: bitsandbytes
+  - local: quantization/compressed_tensors
+    title: compressed-tensors
+  - local: quantization/eetq
+    title: EETQ
+  - local: quantization/fbgemm_fp8
+    title: FBGEMM
+  - local: quantization/finegrained_fp8
+    title: Fine-grained FP8
+  - local: gguf
+    title: GGUF
+  - local: quantization/gptq
+    title: GPTQ
+  - local: quantization/higgs
+    title: HIGGS
+  - local: quantization/hqq
+    title: HQQ
+  - local: quantization/optimum
+    title: Optimum
+  - local: quantization/quanto
+    title: Quanto
+  - local: quantization/quark
+    title: Quark
+  - local: quantization/torchao
+    title: torchao
+  - local: quantization/spqr
+    title: SpQR
+  - local: quantization/vptq
+    title: VPTQ
+  - local: quantization/contribute
+    title: Contribute
+  title: Quantization
+- isExpanded: false
+  sections:
  - local: serialization
-    title: Export to ONNX
+    title: ONNX
  - local: tflite
-    title: Export to TFLite
+    title: LiteRT
+  - local: executorch
+    title: ExecuTorch
  - local: torchscript
-    title: Export to TorchScript
+    title: TorchScript
+  title: Export to production
+- isExpanded: false
+  sections:
+  - sections:
+    - sections:
+      - local: tasks/sequence_classification
+        title: Text classification
+      - local: tasks/token_classification
+        title: Token classification
+      - local: tasks/question_answering
+        title: Question answering
+      - local: tasks/language_modeling
+        title: Causal language modeling
+      - local: tasks/masked_language_modeling
+        title: Masked language modeling
+      - local: tasks/translation
+        title: Translation
+      - local: tasks/summarization
+        title: Summarization
+      - local: tasks/multiple_choice
+        title: Multiple choice
+      title: Natural language processing
+    - sections:
+      - local: tasks/audio_classification
+        title: Audio classification
+      - local: tasks/asr
+        title: Automatic speech recognition
+      title: Audio
+    - sections:
+      - local: tasks/image_classification
+        title: Image classification
+      - local: tasks/semantic_segmentation
+        title: Image segmentation
+      - local: tasks/video_classification
+        title: Video classification
+      - local: tasks/object_detection
+        title: Object detection
+      - local: tasks/zero_shot_object_detection
+        title: Zero-shot object detection
+      - local: tasks/zero_shot_image_classification
+        title: Zero-shot image classification
+      - local: tasks/monocular_depth_estimation
+        title: Depth estimation
+      - local: tasks/image_to_image
+        title: Image-to-Image
+      - local: tasks/image_feature_extraction
+        title: Image Feature Extraction
+      - local: tasks/mask_generation
+        title: Mask Generation
+      - local: tasks/keypoint_detection
+        title: Keypoint detection
+      - local: tasks/knowledge_distillation_for_image_classification
+        title: Knowledge Distillation for Computer Vision
+      title: Computer vision
+    - sections:
+      - local: tasks/image_captioning
+        title: Image captioning
+      - local: tasks/document_question_answering
+        title: Document Question Answering
+      - local: tasks/visual_question_answering
+        title: Visual Question Answering
+      - local: tasks/text-to-speech
+        title: Text to speech
+      - local: tasks/idefics
+        title: Image tasks with IDEFICS
+      - local: tasks/image_text_to_text
+        title: Image-text-to-text
+      - local: tasks/video_text_to_text
+        title: Video-text-to-text
+      - local: tasks/visual_document_retrieval
+        title: Visual Document Retrieval
+      title: Multimodal
+    title: Task recipes
+  - local: run_scripts
+    title: Training scripts
+  - local: glossary
+    title: Glossary
+  - local: philosophy
+    title: Philosophy
  - local: notebooks
    title: Notebooks with examples
  - local: community
    title: Community resources
  - local: troubleshooting
    title: Troubleshoot
-  - local: gguf
-    title: Interoperability with GGUF files
-  - local: tiktoken
-    title: Interoperability with TikToken files
-  - local: modular_transformers
-    title: Modularity in `transformers`
-  - local: how_to_hack_models
-    title: Model Hacking (overwriting a class to your usage)
-  title: Developer guides
- sections:
-  - local: quantization/overview
-    title: Getting started
-  - local: quantization/bitsandbytes
-    title: bitsandbytes
-  - local: quantization/gptq
-    title: GPTQ
-  - local: quantization/awq
-    title: AWQ
-  - local: quantization/aqlm
-    title: AQLM
-  - local: quantization/vptq
-    title: SpQR
-  - local: quantization/spqr
-    title: VPTQ
-  - local: quantization/quanto
-    title: Quanto
-  - local: quantization/eetq
-    title: EETQ
-  - local: quantization/higgs
-    title: HIGGS
-  - local: quantization/hqq
-    title: HQQ
-  - local: quantization/fbgemm_fp8
-    title: FBGEMM_FP8
-  - local: quantization/optimum
-    title: Optimum
-  - local: quantization/torchao
-    title: TorchAO
-  - local: quantization/bitnet
-    title: BitNet
-  - local: quantization/compressed_tensors
-    title: compressed-tensors
-  - local: quantization/finegrained_fp8
-    title: Fine-grained FP8
-  - local: quantization/contribute
-    title: Contribute new quantization method
-  title: Quantization Methods
- sections:
-  - local: performance
-    title: Overview
-  - local: llm_optims
-    title: LLM inference optimization
-  - sections:
-    - local: perf_train_gpu_one
-      title: Methods and tools for efficient training on a single GPU
-    - local: perf_train_gpu_many
-      title: Multiple GPUs and parallelism
-    - local: fsdp
-      title: Fully Sharded Data Parallel
-    - local: deepspeed
-      title: DeepSpeed
-    - local: perf_train_cpu
-      title: Efficient training on CPU
-    - local: perf_train_cpu_many
-      title: Distributed CPU training
-    - local: perf_train_tpu_tf
-      title: Training on TPU with TensorFlow
-    - local: perf_train_special
-      title: PyTorch training on Apple silicon
-    - local: perf_hardware
-      title: Custom hardware for training
-    - local: hpo_train
-      title: Hyperparameter Search using Trainer API
-    title: Efficient training techniques
-  - sections:
-    - local: perf_infer_cpu
-      title: CPU inference
-    - local: perf_infer_gpu_one
-      title: GPU inference
-    - local: perf_infer_gpu_multi
-      title: Multi-GPU inference
-    title: Optimizing inference
-  - local: big_models
-    title: Instantiate a big model
-  - local: debugging
-    title: Debugging
-  - local: tf_xla
-    title: XLA Integration for TensorFlow Models
-  - local: perf_torch_compile
-    title: Optimize inference using `torch.compile()`
-  title: Performance and scalability
- sections:
+  title: Resources
+- isExpanded: false
+  sections:
  - local: contributing
-    title: How to contribute to 🤗 Transformers?
-  - local: add_new_model
-    title: How to add a model to 🤗 Transformers?
-  - local: add_new_pipeline
-    title: How to add a pipeline to 🤗 Transformers?
+    title: Contribute to Transformers
  - local: testing
-    title: Testing
+    title: Transformers model tests
  - local: pr_checks
-    title: Checks on a Pull Request
+    title: Pull request checks
  title: Contribute
- sections:
-  - local: philosophy
-    title: Philosophy
-  - local: glossary
-    title: Glossary
-  - local: task_summary
-    title: What 🤗 Transformers can do
-  - local: tasks_explained
-    title: How 🤗 Transformers solve tasks
-  - local: model_summary
-    title: The Transformer model family
-  - local: tokenizer_summary
-    title: Summary of the tokenizers
-  - local: attention
-    title: Attention mechanisms
-  - local: pad_truncation
-    title: Padding and truncation
-  - local: bertology
-    title: BERTology
-  - local: perplexity
-    title: Perplexity of fixed-length models
-  - local: pipeline_webserver
-    title: Pipelines for webserver inference
-  - local: model_memory_anatomy
-    title: Model training anatomy
-  - local: llm_tutorial_optimization
-    title: Getting the most out of LLMs
-  title: Conceptual guides
- sections:
+- isExpanded: false
+  sections:
  - sections:
-    - local: main_classes/agent
-      title: Agents and Tools
    - local: model_doc/auto
      title: Auto Classes
    - local: main_classes/backbones
@ -313,6 +344,8 @@
      title: Optimization
    - local: main_classes/output
      title: Model outputs
+    - local: main_classes/peft
+      title: PEFT
    - local: main_classes/pipelines
      title: Pipelines
    - local: main_classes/processors
@ -331,10 +364,11 @@
      title: Feature Extractor
    - local: main_classes/image_processor
      title: Image Processor
+    - local: main_classes/video_processor
+      title: Video Processor
    title: Main Classes
  - sections:
-    - isExpanded: false
-      sections:
+    - sections:
      - local: model_doc/albert
        title: ALBERT
      - local: model_doc/bamba
@ -359,6 +393,8 @@
        title: BigBirdPegasus
      - local: model_doc/biogpt
        title: BioGpt
+      - local: model_doc/bitnet
+        title: BitNet
      - local: model_doc/blenderbot
        title: Blenderbot
      - local: model_doc/blenderbot-small
@ -395,6 +431,8 @@
        title: DeBERTa
      - local: model_doc/deberta-v2
        title: DeBERTa-v2
+      - local: model_doc/deepseek_v3
+        title: DeepSeek-V3
      - local: model_doc/dialogpt
        title: DialoGPT
      - local: model_doc/diffllama
@ -439,6 +477,8 @@
        title: Gemma2
      - local: model_doc/glm
        title: GLM
+      - local: model_doc/glm4
+        title: glm4
      - local: model_doc/openai-gpt
        title: GPT
      - local: model_doc/gpt_neo
@ -461,14 +501,16 @@
        title: Granite
      - local: model_doc/granitemoe
        title: GraniteMoe
+      - local: model_doc/granitemoehybrid
+        title: GraniteMoeHybrid
      - local: model_doc/granitemoeshared
        title: GraniteMoeShared
-      - local: model_doc/granitevision
-        title: GraniteVision
      - local: model_doc/helium
        title: Helium
      - local: model_doc/herbert
        title: HerBERT
+      - local: model_doc/hgnet_v2
+        title: HGNet-V2
      - local: model_doc/ibert
        title: I-BERT
      - local: model_doc/jamba
@ -577,6 +619,10 @@
        title: Qwen2
      - local: model_doc/qwen2_moe
        title: Qwen2MoE
+      - local: model_doc/qwen3
+        title: Qwen3
+      - local: model_doc/qwen3_moe
+        title: Qwen3MoE
      - local: model_doc/rag
        title: RAG
      - local: model_doc/realm
@ -644,8 +690,7 @@
      - local: model_doc/zamba2
        title: Zamba2
      title: Text models
-    - isExpanded: false
-      sections:
+    - sections:
      - local: model_doc/beit
        title: BEiT
      - local: model_doc/bit
@ -658,6 +703,8 @@
        title: ConvNeXTV2
      - local: model_doc/cvt
        title: CvT
+      - local: model_doc/d_fine
+        title: D-FINE
      - local: model_doc/dab-detr
        title: DAB-DETR
      - local: model_doc/deformable_detr
@ -704,6 +751,8 @@
        title: Mask2Former
      - local: model_doc/maskformer
        title: MaskFormer
+      - local: model_doc/mlcd
+        title: MLCD
      - local: model_doc/mobilenet_v1
        title: MobileNetV1
      - local: model_doc/mobilenet_v2
@ -716,6 +765,8 @@
        title: NAT
      - local: model_doc/poolformer
        title: PoolFormer
+      - local: model_doc/prompt_depth_anything
+        title: Prompt Depth Anything
      - local: model_doc/pvt
        title: Pyramid Vision Transformer (PVT)
      - local: model_doc/pvt_v2
@ -773,20 +824,23 @@
      - local: model_doc/zoedepth
        title: ZoeDepth
      title: Vision models
-    - isExpanded: false
-      sections:
+    - sections:
      - local: model_doc/audio-spectrogram-transformer
        title: Audio Spectrogram Transformer
      - local: model_doc/bark
        title: Bark
      - local: model_doc/clap
        title: CLAP
+      - local: model_doc/csm
+        title: CSM
      - local: model_doc/dac
        title: dac
      - local: model_doc/encodec
        title: EnCodec
      - local: model_doc/fastspeech2_conformer
        title: FastSpeech2Conformer
+      - local: model_doc/granite_speech
+        title: GraniteSpeech
      - local: model_doc/hubert
        title: Hubert
      - local: model_doc/mctct
@ -844,8 +898,7 @@
      - local: model_doc/xlsr_wav2vec2
        title: XLSR-Wav2Vec2
      title: Audio models
-    - isExpanded: false
-      sections:
+    - sections:
      - local: model_doc/timesformer
        title: TimeSformer
      - local: model_doc/videomae
@ -853,14 +906,15 @@
      - local: model_doc/vivit
        title: ViViT
      title: Video models
-    - isExpanded: false
-      sections:
+    - sections:
      - local: model_doc/align
        title: ALIGN
      - local: model_doc/altclip
        title: AltCLIP
      - local: model_doc/aria
        title: Aria
+      - local: model_doc/aya_vision
+        title: AyaVision
      - local: model_doc/blip
        title: BLIP
      - local: model_doc/blip-2
@ -891,10 +945,14 @@
        title: Emu3
      - local: model_doc/flava
        title: FLAVA
+      - local: model_doc/gemma3
+        title: Gemma3
      - local: model_doc/git
        title: GIT
      - local: model_doc/got_ocr2
        title: GOT-OCR2
+      - local: model_doc/granitevision
+        title: GraniteVision
      - local: model_doc/grounding-dino
        title: Grounding DINO
      - local: model_doc/groupvit
@ -909,6 +967,10 @@
        title: InstructBLIP
      - local: model_doc/instructblipvideo
        title: InstructBlipVideo
+      - local: model_doc/internvl
+        title: InternVL
+      - local: model_doc/janus
+        title: Janus
      - local: model_doc/kosmos-2
        title: KOSMOS-2
      - local: model_doc/layoutlm
@ -921,6 +983,8 @@
        title: LayoutXLM
      - local: model_doc/lilt
        title: LiLT
+      - local: model_doc/llama4
+        title: Llama4
      - local: model_doc/llava
        title: Llava
      - local: model_doc/llava_next
@ -935,6 +999,8 @@
        title: MatCha
      - local: model_doc/mgp-str
        title: MGP-STR
+      - local: model_doc/mistral3
+        title: Mistral3
      - local: model_doc/mllama
        title: mllama
      - local: model_doc/nougat
@ -951,10 +1017,14 @@
        title: PaliGemma
      - local: model_doc/perceiver
        title: Perceiver
+      - local: model_doc/phi4_multimodal
+        title: Phi4 Multimodal
      - local: model_doc/pix2struct
        title: Pix2Struct
      - local: model_doc/pixtral
        title: Pixtral
+      - local: model_doc/qwen2_5_omni
+        title: Qwen2.5-Omni
      - local: model_doc/qwen2_5_vl
        title: Qwen2.5-VL
      - local: model_doc/qwen2_audio
@ -963,6 +1033,10 @@
        title: Qwen2VL
      - local: model_doc/sam
        title: Segment Anything
+      - local: model_doc/sam_hq
+        title: Segment Anything High Quality
+      - local: model_doc/shieldgemma2
+        title: ShieldGemma2
      - local: model_doc/siglip
        title: SigLIP
      - local: model_doc/siglip2
@ -996,15 +1070,13 @@
      - local: model_doc/xclip
        title: X-CLIP
      title: Multimodal models
-    - isExpanded: false
-      sections:
+    - sections:
      - local: model_doc/decision_transformer
        title: Decision Transformer
      - local: model_doc/trajectory_transformer
        title: Trajectory Transformer
      title: Reinforcement learning models
-    - isExpanded: false
-      sections:
+    - sections:
      - local: model_doc/autoformer
        title: Autoformer
      - local: model_doc/informer
@ -1015,9 +1087,10 @@
        title: PatchTST
      - local: model_doc/time_series_transformer
        title: Time Series Transformer
+      - local: model_doc/timesfm
+        title: TimesFM
      title: Time series models
-    - isExpanded: false
-      sections:
+    - sections:
      - local: model_doc/graphormer
        title: Graphormer
      title: Graph models
@ -1025,6 +1098,8 @@
  - sections:
    - local: internal/modeling_utils
      title: Custom Layers and Utilities
+    - local: internal/model_debugging_utils
+      title: Utilities for Model Debugging
    - local: internal/pipelines_utils
      title: Utilities for pipelines
    - local: internal/tokenization_utils
@ -1039,7 +1114,9 @@
      title: Utilities for Audio processing
    - local: internal/file_utils
      title: General Utilities
+    - local: internal/import_utils
+      title: Importing Utilities
    - local: internal/time_series_utils
      title: Utilities for Time Series
-    title: Internal Helpers
+    title: Internal helpers
  title: API
--- a/docs/source/en/accelerate.md
+++ b/docs/source/en/accelerate.md
@ -1,4 +1,4 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -14,123 +14,152 @@ rendered properly in your Markdown viewer.

 -->

-# Distributed training with 🤗 Accelerate
+# Accelerate

-As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment.
+[Accelerate](https://hf.co/docs/accelerate/index) is a library designed to simplify distributed training on any type of setup with PyTorch by uniting the most common frameworks ([Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/)) for it into a single interface. [`Trainer`] is powered by Accelerate under the hood, enabling loading big models and distributed training.

-## Setup
-
-Get started by installing 🤗 Accelerate:
+This guide will show you two ways to use Accelerate with Transformers, using FSDP as the backend. The first method demonstrates distributed training with [`Trainer`], and the second method demonstrates adapting a PyTorch training loop. For more detailed information about Accelerate, please refer to the [documentation](https://hf.co/docs/accelerate/index).

 ```bash
 pip install accelerate
 ```

-Then import and create an [`~accelerate.Accelerator`] object. The [`~accelerate.Accelerator`] will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device.
-
-```py
->>> from accelerate import Accelerator
-
->>> accelerator = Accelerator()
-```
-
-## Prepare to accelerate
-
-The next step is to pass all the relevant training objects to the [`~accelerate.Accelerator.prepare`] method. This includes your training and evaluation DataLoaders, a model and an optimizer:
-
-```py
->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
-...     train_dataloader, eval_dataloader, model, optimizer
-... )
-```
-
-## Backward
-
-The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`] method:
-
-```py
->>> for epoch in range(num_epochs):
-...     for batch in train_dataloader:
-...         outputs = model(**batch)
-...         loss = outputs.loss
-...         accelerator.backward(loss)
-
-...         optimizer.step()
-...         lr_scheduler.step()
-...         optimizer.zero_grad()
-...         progress_bar.update(1)
-```
-
-As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training!
-
-```diff
-+ from accelerate import Accelerator
-  from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
-
-+ accelerator = Accelerator()
-
-  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
-  optimizer = AdamW(model.parameters(), lr=3e-5)
-
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
- model.to(device)
-
-+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
-+     train_dataloader, eval_dataloader, model, optimizer
-+ )
-
-  num_epochs = 3
-  num_training_steps = num_epochs * len(train_dataloader)
-  lr_scheduler = get_scheduler(
-      "linear",
-      optimizer=optimizer,
-      num_warmup_steps=0,
-      num_training_steps=num_training_steps
-  )
-
-  progress_bar = tqdm(range(num_training_steps))
-
-  model.train()
-  for epoch in range(num_epochs):
-      for batch in train_dataloader:
-         batch = {k: v.to(device) for k, v in batch.items()}
-          outputs = model(**batch)
-          loss = outputs.loss
-         loss.backward()
-+         accelerator.backward(loss)
-
-          optimizer.step()
-          lr_scheduler.step()
-          optimizer.zero_grad()
-          progress_bar.update(1)
-```
-
-## Train
-
-Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory.
-
-### Train with a script
-
-If you are running your training from a script, run the following command to create and save a configuration file:
+Start by running [accelerate config](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-config) in the command line to answer a series of prompts about your training system. This creates and saves a configuration file to help Accelerate correctly set up training based on your setup.

 ```bash
 accelerate config
 ```

-Then launch your training with:
+Depending on your setup and the answers you provide, an example configuration file for distributing training with FSDP on one machine with two GPUs may look like the following.

-```bash
-accelerate launch train.py
+```yaml
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_backward_prefetch_policy: BACKWARD_PRE
+  fsdp_forward_prefetch: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_sharding_strategy: FULL_SHARD
+  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_transformer_layer_cls_to_wrap: BertLayer
+  fsdp_use_orig_params: true
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 2
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
 ```

-### Train with a notebook
+## Trainer

-🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to [`~accelerate.notebook_launcher`]:
+Pass the path to the saved configuration file to [`TrainingArguments`], and from there, pass your [`TrainingArguments`] to [`Trainer`].

 ```py
->>> from accelerate import notebook_launcher
+from transformers import TrainingArguments, Trainer

->>> notebook_launcher(training_function)
+training_args = TrainingArguments(
+    output_dir="your-model",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=2,
+    fsdp_config="path/to/fsdp_config",
+    fsdp_strategy="full_shard",
+    weight_decay=0.01,
+    eval_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    push_to_hub=True,
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=dataset["train"],
+    eval_dataset=dataset["test"],
+    processing_class=tokenizer,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+)
+
+trainer.train()
 ```

-For more information about 🤗 Accelerate and its rich features, refer to the [documentation](https://huggingface.co/docs/accelerate).
+## Native PyTorch
+
+Accelerate can also be added to any PyTorch training loop to enable distributed training. The [`~accelerate.Accelerator`] is the main entry point for adapting your PyTorch code to work with Accelerate. It automatically detects your distributed training setup and initializes all the necessary components for training. You don't need to explicitly place your model on a device because [`~accelerate.Accelerator`] knows which device to move your model to.
+
+```py
+from accelerate import Accelerator
+
+accelerator = Accelerator()
+device = accelerator.device
+```
+
+All PyTorch objects (model, optimizer, scheduler, dataloaders) should be passed to the [`~accelerate.Accelerator.prepare`] method now. This method moves your model to the appropriate device or devices, adapts the optimizer and scheduler to use [`~accelerate.optimizer.AcceleratedOptimizer`] and [`~accelerate.scheduler.AcceleratedScheduler`], and creates a new shardable dataloader.
+
+```py
+train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+    train_dataloader, eval_dataloader, model, optimizer
+)
+```
+
+Replace `loss.backward` in your training loop with Accelerates [`~accelerate.Accelerator.backward`] method to scale the gradients and determine the appropriate `backward` method to use depending on your framework (for example, DeepSpeed or Megatron).
+
+```py
+for epoch in range(num_epochs):
+    for batch in train_dataloader:
+        outputs = model(**batch)
+        loss = outputs.loss
+        accelerator.backward(loss)
+        optimizer.step()
+        lr_scheduler.step()
+        optimizer.zero_grad()
+        progress_bar.update(1)
+```
+
+Combine everything into a function and make it callable as a script.
+
+```py
+from accelerate import Accelerator
+  
+def main():
+  accelerator = Accelerator()
+
+  model, optimizer, training_dataloader, scheduler = accelerator.prepare(
+      model, optimizer, training_dataloader, scheduler
+  )
+
+  for batch in training_dataloader:
+      optimizer.zero_grad()
+      inputs, targets = batch
+      outputs = model(inputs)
+      loss = loss_function(outputs, targets)
+      accelerator.backward(loss)
+      optimizer.step()
+      scheduler.step()
+
+if __name__ == "__main__":
+    main()
+```
+
+From the command line, call [accelerate launch](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-launch) to run your training script. Any additional arguments or parameters can be passed here as well.
+
+To launch your training script on two GPUs, add the `--num_processes` argument.
+
+```bash
+accelerate launch --num_processes=2 your_script.py
+```
+
+Refer to the [Launching Accelerate scripts](https://hf.co/docs/accelerate/main/en/basic_tutorials/launch) for more details.
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
--- a/docs/source/en/add_new_pipeline.md
+++ b/docs/source/en/add_new_pipeline.md
@ -1,4 +1,4 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -13,92 +13,66 @@ rendered properly in your Markdown viewer.

 -->

-# How to create a custom pipeline?
+# Adding a new pipeline

-In this guide, we will see how to create a custom pipeline and share it on the [Hub](https://hf.co/models) or add it to the
-🤗 Transformers library.
+Make [`Pipeline`] your own by subclassing it and implementing a few methods. Share the code with the community on the [Hub](https://hf.co) and register the pipeline with Transformers so that everyone can quickly and easily use it.

-First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
-dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible
-as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the
-pipeline (`preprocess`).
+This guide will walk you through the process of adding a new pipeline to Transformers.

-Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of
-`postprocess` method.
+## Design choices

-Start by inheriting the base class `Pipeline` with the 4 methods needed to implement `preprocess`,
-`_forward`, `postprocess`, and `_sanitize_parameters`.
+At a minimum, you only need to provide [`Pipeline`] with an appropriate input for a task. This is also where you should begin when designing your pipeline.

+Decide what input types [`Pipeline`] can accept. It can be strings, raw bytes, dictionaries, and so on. Try to keep the inputs in pure Python where possible because it's more compatible. Next, decide on the output [`Pipeline`] should return. Again, keeping the output in Python is the simplest and best option because it's easier to work with.

-```python
+Keeping the inputs and outputs simple, and ideally JSON-serializable, makes it easier for users to run your [`Pipeline`] without needing to learn new object types. It's also common to support many different input types for even greater ease of use. For example, making an audio file acceptable from a filename, URL, or raw bytes gives the user more flexibility in how they provide the audio data.
+
+## Create a pipeline
+
+With an input and output decided, you can start implementing [`Pipeline`]. Your pipeline should inherit from the base [`Pipeline`] class and include 4 methods.
+
+```py
 from transformers import Pipeline

-
 class MyPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
-        preprocess_kwargs = {}
-        if "maybe_arg" in kwargs:
-            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
-        return preprocess_kwargs, {}, {}

-    def preprocess(self, inputs, maybe_arg=2):
-        model_input = Tensor(inputs["input_ids"])
-        return {"model_input": model_input}
+    def preprocess(self, inputs, args=2):

    def _forward(self, model_inputs):
-        # model_inputs == {"model_input": model_input}
-        outputs = self.model(**model_inputs)
-        # Maybe {"logits": Tensor(...)}
-        return outputs

    def postprocess(self, model_outputs):
-        best_class = model_outputs["logits"].softmax(-1)
-        return best_class
 ```

-The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing
-pre/postprocessing on the CPU on different threads
+1. `preprocess` takes the inputs and transforms them into the appropriate input format for the model.

-`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might
-contain more information and is usually a `Dict`.
-
-`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred
-called method as it contains safeguards to make sure everything is working on the expected device. If anything is
-linked to a real model it belongs in the `_forward` method, anything else is in the preprocess/postprocess.
-
-`postprocess` methods will take the output of `_forward` and turn it into the final output that was decided
-earlier.
-
-`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization
-time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`.
-
-The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`,
-`_forward`, and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That
-allows to keep the default arguments in the function definition which is always more "natural".
-
-A classic example would be a `top_k` argument in the post processing in classification tasks.
-
-```python
->>> pipe = pipeline("my-new-task")
->>> pipe("This is a test")
-[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05}
-{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
-
->>> pipe("This is a test", top_k=2)
-[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
+```py
+def preprocess(self, inputs, maybe_arg=2):
+    model_input = Tensor(inputs["input_ids"])
+    return {"model_input": model_input}
 ```

-In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit
-`_sanitize_parameters` to allow this new parameter.
+2. `_forward` shouldn't be called directly. `forward` is the preferred method because it includes safeguards to make sure everything works correctly on the expected device. Anything linked to the model belongs in `_forward` and everything else belongs in either `preprocess` or `postprocess`.

+```py
+def _forward(self, model_inputs):
+    outputs = self.model(**model_inputs)
+    return outputs
+```

-```python
+3. `postprocess` generates the final output from the models output in `_forward`.
+
+```py
 def postprocess(self, model_outputs, top_k=5):
    best_class = model_outputs["logits"].softmax(-1)
-    # Add logic to handle top_k
    return best_class
+```

+4. `_sanitize_parameters` lets users pass additional parameters to [`Pipeline`]. This could be during initialization or when [`Pipeline`] is called. `_sanitize_parameters` returns 3 dicts of additional keyword arguments that are passed directly to `preprocess`, `_forward`, and `postprocess`. Don't add anything if a user didn't call the pipeline with extra parameters. This keeps the default arguments in the function definition which is always more natural.

+For example, add a `top_k` parameter in `postprocess` to return the top 5 most likely classes. Then in `_sanitize_parameters`, check if the user passed in `top_k` and add it to `postprocess_kwargs`.
+
+```py
 def _sanitize_parameters(self, **kwargs):
    preprocess_kwargs = {}
    if "maybe_arg" in kwargs:
@ -110,55 +84,61 @@ def _sanitize_parameters(self, **kwargs):
    return preprocess_kwargs, {}, postprocess_kwargs
 ```

-Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy
-without requiring users to understand new kinds of objects. It's also relatively common to support many different types
-of arguments for ease of use (audio files, which can be filenames, URLs or pure bytes)
+Now the pipeline can return the top most likely labels if a user chooses to.

+```py
+from transformers import pipeline

+pipeline = pipeline("my-task")
+# returns 3 most likely labels
+pipeline("This is the best meal I've ever had", top_k=3)
+# returns 5 most likely labels by default
+pipeline("This is the best meal I've ever had")
+```

-## Adding it to the list of supported tasks
+## Register a pipeline

-To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`:
+Register the new task your pipeline supports in the `PIPELINE_REGISTRY`. The registry defines:

-```python
+- the machine learning framework the pipeline supports with either `pt_model` or `tf_model` (add both to ensure it works with either frameworks)
+- a default model which should come from a specific revision (branch, or commit hash) where the model works as expected with `default`
+- the expected input with `type`
+
+```py
 from transformers.pipelines import PIPELINE_REGISTRY
+from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification

 PIPELINE_REGISTRY.register_pipeline(
    "new-task",
    pipeline_class=MyPipeline,
    pt_model=AutoModelForSequenceClassification,
+    tf_model=TFAutoModelForSequenceClassification,
+    default={"pt": ("user/awesome-model", "branch-name")},
+    type="text",
 )
 ```

-You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took `"abcdef"`) as well as the type:
+## Share your pipeline

-```python
-PIPELINE_REGISTRY.register_pipeline(
-    "new-task",
-    pipeline_class=MyPipeline,
-    pt_model=AutoModelForSequenceClassification,
-    default={"pt": ("user/awesome_model", "abcdef")},
-    type="text",  # current support type: text, audio, image, multimodal
-)
-```
+Share your pipeline with the community on the [Hub](https://hf.co) or you can add it directly to Transformers.

-## Share your pipeline on the Hub
+It's faster to upload your pipeline code to the Hub because it doesn't require a review from the Transformers team. Adding the pipeline to Transformers may be slower because it requires a review and you need to add tests to ensure your [`Pipeline`] works.

-To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a
-python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this:
+### Upload to the Hub
+
+Add your pipeline code to the Hub in a Python file.
+
+For example, a custom pipeline for sentence pair classification might look like the following code below. The implementation works for PyTorch and TensorFlow models.

 ```py
 import numpy as np
-
 from transformers import Pipeline

-
 def softmax(outputs):
    maxes = np.max(outputs, axis=-1, keepdims=True)
    shifted_exp = np.exp(outputs - maxes)
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)

-
 class PairClassificationPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
@ -183,8 +163,7 @@ class PairClassificationPipeline(Pipeline):
        return {"label": label, "score": score, "logits": logits}
 ```

-The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in
-a file named `pair_classification.py`, we can then import it and register it like this.
+Save the code in a file named `pair_classification.py`, and import and register it as shown below.

 ```py
 from pair_classification import PairClassificationPipeline
@ -215,56 +194,36 @@ The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5f
  },
 ```

-Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been
-fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not.
+Call [`~Pipeline.push_to_hub`] to push the pipeline to the Hub. The Python file containing the code is copied to the Hub, and the pipelines model and tokenizer are also saved and pushed to the Hub. Your pipeline should now be available on the Hub under your namespace.

 ```py
 from transformers import pipeline

-classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc")
+pipeline = pipeline(task="pair-classification", model="sgugger/finetuned-bert-mrpc")
+pipeline.push_to_hub("pair-classification-pipeline")
 ```

-Then we can share it on the Hub by using the `push_to_hub` method:
-
-```py
-classifier.push_to_hub("test-dynamic-pipeline")
-```
-
-This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`,
-along with saving the model and tokenizer of the pipeline, before pushing everything into the repository
-`{your_username}/test-dynamic-pipeline`. After that, anyone can use it as long as they provide the option
-`trust_remote_code=True`:
+To use the pipeline, add `trust_remote_code=True` when loading the pipeline.

 ```py
 from transformers import pipeline

-classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True)
+pipeline = pipeline(task="pair-classification", trust_remote_code=True)
 ```

-## Add the pipeline to 🤗 Transformers
+### Add to Transformers

-If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the `pipelines` submodule
-with the code of your pipeline, then add it to the list of tasks defined in `pipelines/__init__.py`.
+Adding a custom pipeline to Transformers requires adding tests to make sure everything works as expected, and requesting a review from the Transformers team.

-Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py` with examples of the other tests.
+Add your pipeline code as a new module to the [pipelines](https://github.com/huggingface/transformers/tree/main/src/transformers/pipelines) submodule, and add it to the list of tasks defined in [pipelines/__init__.py](https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/__init__.py).

-The `run_pipeline_test` function will be very generic and run on small random models on every possible
-architecture as defined by `model_mapping` and `tf_model_mapping`.
+Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline.

-This is very important to test future compatibility, meaning if someone adds a new model for
-`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's
-impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the
-output of the pipeline TYPE.
+The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48) and [tf_model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L49). This is important for testing future compatibility with new models.

-You also *need* to implement 2 (ideally 4) tests.
+You'll also notice `ANY` is used throughout the [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function. The models are random, so you can't check the actual values. Using `ANY` allows the test to match the output of the pipeline type instead.

- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
-  and test the pipeline outputs. The results should be the same as `test_small_model_tf`.
- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
-  and test the pipeline outputs. The results should be the same as `test_small_model_pt`.
- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to
-  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
-  sure there is no drift in future releases.
- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to
-  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
-  sure there is no drift in future releases.
+Finally, you should also implement the following 4 tests.
+
+1. [test_small_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L59) and [test_small_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L150), use a small model for these pipelines to make sure they return the correct outputs. The results don't have to make sense. Each pipeline should return the same result.
+1. [test_large_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L187) nad [test_large_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L220), use a realistic model for these pipelines to make sure they return meaningful results. These tests are slow and should be marked as slow.
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@ -13,7 +13,6 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
-# Agents and tools

 > [!WARNING]
-> This subpackage will soon be deprecated, since it has ben spun off into [smolagents](https://huggingface.co/docs/smolagents/index). Smolagents has extended functionality, and a similar API.
+> Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
--- a/docs/source/en/attention_interface.md
+++ b/docs/source/en/attention_interface.md
@ -0,0 +1,128 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Attention Interface
+
+This page describes how to use the `AttentionInterface` in order to register custom attention functions to use with
+supported models.
+
+## Customizing attention function
+
+Most recent models can now switch from one attention function used in the Attention layer to the other, thanks to a simple mapping.
+By default, we provide the implementation for [`sdpa`](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html),
+[`flash_attention_2`](https://github.com/Dao-AILab/flash-attention) and [`flex_attention`](https://pytorch.org/docs/stable/nn.attention.flex_attention.html#module-torch.nn.attention.flex_attention)
+as well as `eager`, which is a simple matrix multiplication without any optimization on top.  
+This is the setting you can usually choose when instantiating a model:
+
+```python
+from transformers import AutoModelForCausalLM
+
+model_id = "meta-llama/Llama-3.2-1B"
+
+# Here, using flash attention as an example
+model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="flash_attention_2")
+```
+
+But what if you wanted to create your own attention function? Or simply play around with existing ones, adding
+a few statements here and there? You can now do so with the `AttentionInterface`! Here is an example:
+
+```python
+from transformers import AutoModelForCausalLM, AttentionInterface
+from transformers.integrations.sdpa_attention import sdpa_attention_forward
+import torch
+
+model_id = "meta-llama/Llama-3.2-1B"
+
+def my_new_sdpa(*args, **kwargs):
+    print("I just entered the attention computation")
+    return sdpa_attention_forward(*args, **kwargs)
+
+AttentionInterface.register("my_new_sdpa", my_new_sdpa)
+
+model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="my_new_sdpa")
+# Try running the forward with the new attention function
+model(torch.ones(1, 5, dtype=int))
+```
+
+You will see it prints "I just entered the attention computation" as many times as there are layers in the model (with this example, 16 times).
+
+## Dynamically switching attention function
+
+You could dynamically change the model's attention function as well, by overriding the `config._attn_implementation` field:
+
+```python
+# Back to use original sdpa implementation
+model.config._attn_implementation = "sdpa"
+
+model(torch.ones(1, 5, dtype=int))
+```
+
+and it will stop printing the statements, as it now uses the `sdpa` attention.  
+This allows to quickly change an attention function, without needing to reload the model!
+
+## What about new args needed in my custom attention function?
+
+But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
+`AttentionInterface` propagate kwargs all the way to the Attention layers, and to the used attention function. That way,
+you can simply pass the arg (as a kwargs, i.e. you need to qualify the name of the arg) in the model's forward, and it will be correctly used in the attention. However, custom attention functions have some limitations. In particular, it must follow the signature and return format of other attention functions, i.e.
+
+```python
+from transformers import AutoModelForCausalLM, AttentionInterface
+from transformers.integrations.sdpa_attention import sdpa_attention_forward
+import torch
+
+def custom_attention(
+    module: torch.nn.Module,  # required arg
+    query: torch.Tensor,  # required arg
+    key: torch.Tensor,  # required arg
+    value: torch.Tensor,  # required arg
+    attention_mask: Optional[torch.Tensor],  # required arg
+    a_new_kwargs = None,  # You can now add as many kwargs as you need
+    another_new_kwargs = None,  # You can now add as many kwargs as you need
+    **kwargs,  # You need to accept **kwargs as models will pass other args
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]
+    ...  # do your magic!
+    return attn_output, attn_weights  # attn_weights are optional here
+
+AttentionInterface.register("custom", custom_attention)
+
+model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="custom")
+# Forward pass with the new kwargs
+model(torch.ones(1, 5, dtype=int), a_new_kwargs=..., another_new_kwargs=...)
+```
+
+If in doubt about what args/kwargs a given model sends to the attention function, simply check that model's modeling code on [GitHub](https://github.com/huggingface/transformers/tree/main/src/transformers/models)!
+
+## Accessing current available implementations
+
+Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
+and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
+would expect from a usual Python dictionary:
+
+```python
+>>> from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+>>> list(ALL_ATTENTION_FUNCTIONS.keys())
+>>> ['flash_attention_2', 'flex_attention', 'sdpa']
+
+>>> ALL_ATTENTION_FUNCTIONS["sdpa"]
+>>> <function transformers.integrations.sdpa_attention.sdpa_attention_forward>
+
+>>> ALL_ATTENTION_FUNCTIONS.get("sdpa", None)
+>>> <function transformers.integrations.sdpa_attention.sdpa_attention_forward>
+
+# You can also globally `register` a new function directly on it
+>>> ALL_ATTENTION_FUNCTIONS.register("new_func", new_func)
+```
--- a/docs/source/en/auto_docstring.md
+++ b/docs/source/en/auto_docstring.md
@ -0,0 +1,279 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Utilizing the @auto_docstring Decorator
+
+The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
+
+---
+
+## 📜 How it Works
+
+The `@auto_docstring` decorator constructs docstrings by:
+
+1.  **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
+2.  **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
+3.  **Overriding or Adding Arguments Descriptions:**
+    * **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
+    * **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
+4.  **Adding Classes and Functions Introduction:**
+    * **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
+    * **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
+5.  **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
+6.  **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
+7.  **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extracts field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
+8.  **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
+
+
+---
+
+## 🚀 How to Use @auto_docstring
+
+### 1. Importing the Decorator
+Import the decorator into your modeling file:
+
+```python
+from ...utils import auto_docstring
+```
+
+### 2. Applying to Classes
+Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
+
+```python
+from transformers.modeling_utils import PreTrainedModel
+from ...utils import auto_docstring
+
+@auto_docstring
+class MyAwesomeModel(PreTrainedModel):
+    def __init__(self, config, custom_parameter: int = 10, another_custom_arg: str = "default"):
+        r"""
+        custom_parameter (`int`, *optional*, defaults to 10):
+            Description of the custom_parameter for MyAwesomeModel.
+        another_custom_arg (`str`, *optional*, defaults to "default"):
+            Documentation for another unique argument.
+        """
+        super().__init__(config)
+        self.custom_parameter = custom_parameter
+        self.another_custom_arg = another_custom_arg
+        # ... rest of your init
+
+    # ... other methods
+```
+
+#### Advanced Class Decoration:
+
+Arguments can be passed directly to `@auto_docstring` for more control:
+
+```python
+@auto_docstring(
+    custom_intro="""This model performs specific synergistic operations.
+    It builds upon the standard Transformer architecture with unique modifications.""",
+    custom_args="""
+    custom_parameter (`type`, *optional*, defaults to `default_value`):
+        A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
+    internal_helper_arg (`type`, *optional*, defaults to `default_value`):
+        A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
+    """
+)
+class MySpecialModel(PreTrainedModel):
+    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
+        # ...
+```
+
+Or:
+
+```python
+@auto_docstring(
+    custom_intro="""This model performs specific synergistic operations.
+    It builds upon the standard Transformer architecture with unique modifications.""",
+)
+class MySpecialModel(PreTrainedModel):
+    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
+        r"""
+        custom_parameter (`type`, *optional*, defaults to `default_value`):
+            A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
+        internal_helper_arg (`type`, *optional*, defaults to `default_value`):
+            A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
+        """
+        # ...
+```
+
+### 3. Applying to Functions (e.g., `forward` method)
+Apply the decorator above method definitions, such as the `forward` method.
+
+```python
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        new_custom_argument: Optional[torch.Tensor] = None,
+        arg_documented_in_args_doc: Optional[torch.Tensor] = None,
+        # ... other arguments
+    ) -> Union[Tuple, ModelOutput]: # The description of the return value will automatically be generated from the ModelOutput class docstring.
+        r"""
+        new_custom_argument (`torch.Tensor`, *optional*):
+            Description of this new custom argument and its expected shape or type.
+        """
+        # ...
+```
+
+#### Advanced Function Decoration:
+
+Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:
+
+```python
+MODEL_COMMON_CUSTOM_ARGS = r"""
+    common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
+        Description of common_arg_1
+    common_arg_2 (`torch.Tensor`, *optional*, defaults to `default_value`):
+        Description of common_arg_2
+    ...
+"""
+
+class MyModel(PreTrainedModel):
+    # ...
+    @auto_docstring(
+        custom_intro="""
+        This is a custom introduction for the function.
+        """
+        custom_args=MODEL_COMMON_CUSTOM_ARGS
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        common_arg_1: Optional[torch.Tensor] = None,
+        common_arg_2: Optional[torch.Tensor] = None,
+        #...
+        function_specific_argument: Optional[torch.Tensor] = None,
+        # ... other arguments
+    ) -> torch.Tensor:
+        r"""
+        function_specific_argument (`torch.Tensor`, *optional*):
+            Description of an argument specific to this function
+
+        Returns:
+            `torch.Tensor`: For a function returning a generic type, a custom "Returns" section can be specified.
+
+        Example:
+
+        (To override the default example with a custom one or to add an example for a model class that does not have a pipeline)
+
+        ```python
+        ...
+        ```
+        """
+        # ...
+```
+
+---
+
+### ✍️ Documenting Arguments: Approach & Priority
+
+1.  **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
+    * `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
+
+2.  **New or Custom Arguments:**
+    * **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
+    * **Format:**
+        ```
+        argument_name (`type`, *optional*, defaults to `X`):
+            Description of the argument.
+            Explain its purpose, expected shape/type if complex, and default behavior.
+            This can span multiple lines.
+        ```
+    * Include `type` in backticks.
+    * Add "*optional*" if the argument is not required (has a default value).
+    * Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
+
+3.  **Overriding Standard Arguments:**
+    * If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
+    * The `labels` argument is often customized per model and typically requires a specific docstring.
+
+4.  **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
+    * New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
+
+---
+
+### Usage with [modular files](./modular_transformers)
+
+When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
+
+- **For standalone models in modular files:**
+  Apply the `@auto_docstring` decorator just as you would in regular modeling files.
+
+- **For models inheriting from other library models:**
+  - When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
+  - If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
+
+  > **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
+
+
+**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
+
+---
+
+## ✅ Checking Your Docstrings with `check_auto_docstrings`
+
+The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
+
+#### What it Checks:
+
+* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
+* **Argument Completeness & Consistency:**
+    * Flags arguments in the signature that are not known standard arguments and lack a local description.
+    * Ensures documented arguments exist in the signature. (TODO)
+    * Verifies that types and default values in the docstring match the signature. (TODO)
+* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
+* **Formatting:** Adherence to the expected docstring style.
+
+#### Running the Check Locally:
+
+Run this check locally before committing. The common command is:
+
+```bash
+make fix-copies
+```
+
+Alternatively, to only perform docstrings and auto-docstring checks, you can use:
+
+```bash
+python utils/check_docstrings.py # to only check files included in the diff without fixing them
+# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
+# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
+```
+
+#### Workflow with the Checker:
+
+1.  Add `@auto_docstring(...)` to the class or method.
+2.  For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
+3.  Run `make fix-copies` (or the `check_docstrings.py` utility).
+    * For unrecognized arguments lacking documentation, the utility will create placeholder entries.
+4.  Manually edit these placeholders with accurate types and descriptions.
+5.  Re-run the check to ensure all issues are resolved.
+
+---
+
+## 🔑 Key Takeaways & Best Practices
+
+* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary for methods (e.g., `forward`, `get_text_features` etc.).
+* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
+* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
+* Document new or custom arguments clearly.
+* Run `check_docstrings` locally and iteratively.
+
+By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
--- a/docs/source/en/autoclass_tutorial.md
+++ b/docs/source/en/autoclass_tutorial.md
@ -1,189 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Load pretrained instances with an AutoClass
-
-With so many different Transformer architectures, it can be challenging to create one for your checkpoint. As a part of 🤗 Transformers core philosophy to make the library easy, simple and flexible to use, an `AutoClass` automatically infers and loads the correct architecture from a given checkpoint. The `from_pretrained()` method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different.
-
-<Tip>
-
-Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/google-bert/bert-base-uncased) is an architecture, while `google-bert/bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint.
-
-</Tip>
-
-In this tutorial, learn to:
-
-* Load a pretrained tokenizer.
-* Load a pretrained image processor
-* Load a pretrained feature extractor.
-* Load a pretrained processor.
-* Load a pretrained model.
-* Load a model as a backbone.
-
-## AutoTokenizer
-
-Nearly every NLP task begins with a tokenizer. A tokenizer converts your input into a format that can be processed by the model.
-
-Load a tokenizer with [`AutoTokenizer.from_pretrained`]:
-
-```py
->>> from transformers import AutoTokenizer
-
->>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
-```
-
-Then tokenize your input as shown below:
-
-```py
->>> sequence = "In a hole in the ground there lived a hobbit."
->>> print(tokenizer(sequence))
-{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], 
- 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
- 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
-```
-
-## AutoImageProcessor
-
-For vision tasks, an image processor processes the image into the correct input format.
-
-```py
->>> from transformers import AutoImageProcessor
-
->>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
-```
-
-## AutoBackbone
-
-<div style="text-align: center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stages.png">
-    <figcaption class="mt-2 text-center text-sm text-gray-500">A Swin backbone with multiple stages for outputting a feature map.</figcaption>
-</div>
-
-The [`AutoBackbone`] lets you use pretrained models as backbones to get feature maps from different stages of the backbone. You should specify one of the following parameters in [`~PretrainedConfig.from_pretrained`]:
-
-* `out_indices` is the index of the layer you'd like to get the feature map from
-* `out_features` is the name of the layer you'd like to get the feature map from
-
-These parameters can be used interchangeably, but if you use both, make sure they're aligned with each other! If you don't pass any of these parameters, the backbone returns the feature map from the last layer.
-
-<div style="text-align: center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stage%201.png">
-    <figcaption class="mt-2 text-center text-sm text-gray-500">A feature map from the first stage of the backbone. The patch partition refers to the model stem.</figcaption>
-</div>
-
-For example, in the above diagram, to return the feature map from the first stage of the Swin backbone, you can set `out_indices=(1,)`:
-
-```py
->>> from transformers import AutoImageProcessor, AutoBackbone
->>> import torch
->>> from PIL import Image
->>> import requests
->>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
->>> image = Image.open(requests.get(url, stream=True).raw)
->>> processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
->>> model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,))
-
->>> inputs = processor(image, return_tensors="pt")
->>> outputs = model(**inputs)
->>> feature_maps = outputs.feature_maps
-```
-
-Now you can access the `feature_maps` object from the first stage of the backbone:
-
-```py
->>> list(feature_maps[0].shape)
-[1, 96, 56, 56]
-```
-
-## AutoFeatureExtractor
-
-For audio tasks, a feature extractor processes the audio signal into the correct input format.
-
-Load a feature extractor with [`AutoFeatureExtractor.from_pretrained`]:
-
-```py
->>> from transformers import AutoFeatureExtractor
-
->>> feature_extractor = AutoFeatureExtractor.from_pretrained(
-...     "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
-... )
-```
-
-## AutoProcessor
-
-Multimodal tasks require a processor that combines two types of preprocessing tools. For example, the [LayoutLMV2](model_doc/layoutlmv2) model requires an image processor to handle images and a tokenizer to handle text; a processor combines both of them.
-
-Load a processor with [`AutoProcessor.from_pretrained`]:
-
-```py
->>> from transformers import AutoProcessor
-
->>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
-```
-
-## AutoModel
-
-<frameworkcontent>
-<pt>
-The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`].
-
-> [!WARNING]
-> By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type.
-
-```py
->>> from transformers import AutoModelForSequenceClassification
-
->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto")
-```
-
-Easily reuse the same checkpoint to load an architecture for a different task:
-
-```py
->>> from transformers import AutoModelForTokenClassification
-
->>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto")
-```
-
-<Tip warning={true}>
-
-For PyTorch models, the `from_pretrained()` method uses `torch.load()` which internally uses `pickle` and is known to be insecure. In general, never load a model that could have come from an untrusted source, or that could have been tampered with. This security risk is partially mitigated for public models hosted on the Hugging Face Hub, which are [scanned for malware](https://huggingface.co/docs/hub/security-malware) at each commit. See the [Hub documentation](https://huggingface.co/docs/hub/security) for best practices like [signed commit verification](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) with GPG.
-
-TensorFlow and Flax checkpoints are not affected, and can be loaded within PyTorch architectures using the `from_tf` and `from_flax` kwargs for the `from_pretrained` method to circumvent this issue.
-
-</Tip>
-
-Generally, we recommend using the `AutoTokenizer` class and the `AutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning.
-</pt>
-<tf>
-Finally, the `TFAutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`TFAutoModelForSequenceClassification.from_pretrained`]:
-
-```py
->>> from transformers import TFAutoModelForSequenceClassification
-
->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
-```
-
-Easily reuse the same checkpoint to load an architecture for a different task:
-
-```py
->>> from transformers import TFAutoModelForTokenClassification
-
->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
-```
-
-Generally, we recommend using the `AutoTokenizer` class and the `TFAutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning.
-</tf>
-</frameworkcontent>
--- a/docs/source/en/backbones.md
+++ b/docs/source/en/backbones.md
@ -0,0 +1,155 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Backbones
+
+Higher-level computer visions tasks, such as object detection or image segmentation, use several models together to generate a prediction. A separate model is used for the *backbone*, neck, and head. The backbone extracts useful features from an input image into a feature map, the neck combines and processes the feature maps, and the head uses them to make a prediction.
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Backbone.png"/>
+</div>
+
+Load a backbone with [`~PretrainedConfig.from_pretrained`] and use the `out_indices` parameter to determine which layer, given by the index, to extract a feature map from.
+
+```py
+from transformers import AutoBackbone
+
+model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,))
+```
+
+This guide describes the backbone class, backbones from the [timm](https://hf.co/docs/timm/index) library, and how to extract features with them.
+
+## Backbone classes
+
+There are two backbone classes.
+
+- [`~transformers.utils.BackboneMixin`] allows you to load a backbone and includes functions for extracting the feature maps and indices.
+- [`~transformers.utils.BackboneConfigMixin`] allows you to set the feature map and indices of a backbone configuration.
+
+Refer to the [Backbone](./main_classes/backbones) API documentation to check which models support a backbone.
+
+There are two ways to load a Transformers backbone, [`AutoBackbone`] and a model-specific backbone class.
+
+<hfoptions id="backbone-classes">
+<hfoption id="AutoBackbone">
+
+The [AutoClass](./model_doc/auto) API automatically loads a pretrained vision model with [`~PretrainedConfig.from_pretrained`] as a backbone if it's supported.
+
+Set the `out_indices` parameter to the layer you'd like to get the feature map from. If you know the name of the layer, you could also use `out_features`. These parameters can be used interchangeably, but if you use both, make sure they refer to the same layer.
+
+When `out_indices` or `out_features` isn't used, the backbone returns the feature map from the last layer. The example code below uses `out_indices=(1,)` to get the feature map from the first layer.
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stage%201.png"/>
+</div>
+
+```py
+from transformers import AutoImageProcessor, AutoBackbone
+
+model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,))
+```
+
+</hfoption>
+<hfoption id="model-specific backbone">
+
+When you know a model supports a backbone, you can load the backbone and neck directly into the models configuration. Pass the configuration to the model to initialize it for a task.
+
+The example below loads a [ResNet](./model_doc/resnet) backbone and neck for use in a [MaskFormer](./model_doc/maskformer) instance segmentation head.
+
+Set `backbone` to a pretrained model and  `use_pretrained_backbone=True` to use pretrained weights instead of randomly initialized weights.
+
+```py
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=True)
+model = MaskFormerForInstanceSegmentation(config)
+```
+
+Another option is to separately load the backbone configuration and then pass it to `backbone_config` in the model configuration.
+
+```py
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig
+
+# instantiate backbone configuration
+backbone_config = ResNetConfig()
+# load backbone in model
+config = MaskFormerConfig(backbone_config=backbone_config)
+# attach backbone to model head
+model = MaskFormerForInstanceSegmentation(config)
+```
+
+</hfoption>
+</hfoptions>
+
+## timm backbones
+
+[timm](https://hf.co/docs/timm/index) is a collection of vision models for training and inference. Transformers supports timm models as backbones with the [`TimmBackbone`] and [`TimmBackboneConfig`] classes.
+
+Set `use_timm_backbone=True` to load pretrained timm weights, and `use_pretrained_backbone` to use pretrained or randomly initialized weights.
+
+```py
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone="resnet50", use_timm_backbone=True, use_pretrained_backbone=True)
+model = MaskFormerForInstanceSegmentation(config)
+```
+
+You could also explicitly call the [`TimmBackboneConfig`] class to load and create a pretrained timm backbone.
+
+```py
+from transformers import TimmBackboneConfig
+
+backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=True)
+```
+
+Pass the backbone configuration to the model configuration and instantiate the model head, [`MaskFormerForInstanceSegmentation`], with the backbone.
+
+```py
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
+
+config = MaskFormerConfig(backbone_config=backbone_config)
+model = MaskFormerForInstanceSegmentation(config)
+```
+
+## Feature extraction
+
+The backbone is used to extract image features. Pass an image through the backbone to get the feature maps.
+
+Load and preprocess an image and pass it to the backbone. The example below extracts the feature maps from the first layer.
+
+```py
+from transformers import AutoImageProcessor, AutoBackbone
+import torch
+from PIL import Image
+import requests
+
+model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,))
+processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(image, return_tensors="pt")
+outputs = model(**inputs)
+```
+
+The features are stored and accessed from the outputs `feature_maps` attribute.
+
+```py
+feature_maps = outputs.feature_maps
+list(feature_maps[0].shape)
+[1, 96, 56, 56]
+```
--- a/docs/source/en/bertology.md
+++ b/docs/source/en/bertology.md
@ -1,41 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# BERTology
-
-There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT
-(that some call "BERTology"). Some good examples of this field are:
-
-
- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick:
-  https://arxiv.org/abs/1905.05950
- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
-  Manning: https://arxiv.org/abs/1906.04341
- CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633
-
-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to
-help people access the inner representations, mainly adapted from the great work of Paul Michel
-(https://arxiv.org/abs/1905.10650):
-
-
- accessing all the hidden-states of BERT/GPT/GPT-2,
- accessing all the attention weights for each head of BERT/GPT/GPT-2,
- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained
-  in https://arxiv.org/abs/1905.10650.
-
-To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) which extracts information and prune a model pre-trained on
-GLUE.
--- a/docs/source/en/big_models.md
+++ b/docs/source/en/big_models.md
@ -1,215 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Instantiate a big model
-
-A barrier to accessing very large pretrained models is the amount of memory required. When loading a pretrained PyTorch model, you usually:
-
-1. Create a model with random weights.
-2. Load your pretrained weights.
-3. Put those pretrained weights in the model.
-
-The first two steps both require a full version of the model in memory and if the model weighs several GBs, you may not have enough memory for two copies of it. This problem is amplified in distributed training environments because each process loads a pretrained model and stores two copies in memory.
-
-> [!TIP]
-> The randomly created model is initialized with "empty" tensors, which take space in memory without filling it. The random values are whatever was in this chunk of memory at the time. To improve loading speed, the [`_fast_init`](https://github.com/huggingface/transformers/blob/c9f6e5e35156e068b227dd9b15521767f6afd4d2/src/transformers/modeling_utils.py#L2710) parameter is set to `True` by default to skip the random initialization for all weights that are correctly loaded.
-
-This guide will show you how Transformers can help you load large pretrained models despite their memory requirements.
-
-## Sharded checkpoints
-
-From Transformers v4.18.0, a checkpoint larger than 10GB is automatically sharded by the [`~PreTrainedModel.save_pretrained`] method. It is split into several smaller partial checkpoints and creates an index file that maps parameter names to the files they're stored in.
-
-The maximum shard size is controlled with the `max_shard_size` parameter, but by default it is 5GB, because it is easier to run on free-tier GPU instances without running out of memory.
-
-For example, let's shard [BioMistral/BioMistral-7B](https://hf.co/BioMistral/BioMistral-7B).
-
-```py
->>> with tempfile.TemporaryDirectory() as tmp_dir:
-...     model.save_pretrained(tmp_dir, max_shard_size="5GB")
-...     print(sorted(os.listdir(tmp_dir)))
-['config.json', 'generation_config.json', 'model-00001-of-00006.safetensors', 'model-00002-of-00006.safetensors', 'model-00003-of-00006.safetensors', 'model-00004-of-00006.safetensors', 'model-00005-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model.safetensors.index.json']
-```
-
-The sharded checkpoint is reloaded with the [`~PreTrainedModel.from_pretrained`] method.
-
-```py
->>> with tempfile.TemporaryDirectory() as tmp_dir:
-...     model.save_pretrained(tmp_dir, max_shard_size="5GB")
-...     new_model = AutoModel.from_pretrained(tmp_dir)
-```
-
-The main advantage of sharded checkpoints for big models is that each shard is loaded after the previous one, which caps the memory usage to only the model size and the largest shard size.
-
-You could also directly load a sharded checkpoint inside a model without the [`~PreTrainedModel.from_pretrained`] method (similar to PyTorch's `load_state_dict()` method for a full checkpoint). In this case, use the [`~modeling_utils.load_sharded_checkpoint`] method.
-
-```py
->>> from transformers.modeling_utils import load_sharded_checkpoint
-
->>> with tempfile.TemporaryDirectory() as tmp_dir:
-...     model.save_pretrained(tmp_dir, max_shard_size="5GB")
-...     load_sharded_checkpoint(model, tmp_dir)
-```
-
-### Shard metadata
-
-The index file determines which keys are in the checkpoint and where the corresponding weights are stored. This file is loaded like any other JSON file and you can get a dictionary from it.
-
-```py
->>> import json
-
->>> with tempfile.TemporaryDirectory() as tmp_dir:
-...     model.save_pretrained(tmp_dir, max_shard_size="5GB")
-...     with open(os.path.join(tmp_dir, "model.safetensors.index.json"), "r") as f:
-...         index = json.load(f)
-
->>> print(index.keys())
-dict_keys(['metadata', 'weight_map'])
-```
-
-The `metadata` key provides the total model size.
-
-```py
->>> index["metadata"]
-{'total_size': 28966928384}
-```
-
-The `weight_map` key maps each parameter name (typically `state_dict` in a PyTorch model) to the shard it's stored in.
-
-```py
->>> index["weight_map"]
-{'lm_head.weight': 'model-00006-of-00006.safetensors',
- 'model.embed_tokens.weight': 'model-00001-of-00006.safetensors',
- 'model.layers.0.input_layernorm.weight': 'model-00001-of-00006.safetensors',
- 'model.layers.0.mlp.down_proj.weight': 'model-00001-of-00006.safetensors',
- ...
-}
-```
-
-## Accelerate's Big Model Inference
-
-> [!TIP]
-> Make sure you have Accelerate v0.9.0 or later and PyTorch v1.9.0 or later installed.
-
-From Transformers v4.20.0, the [`~PreTrainedModel.from_pretrained`] method is supercharged with Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature to efficiently handle really big models! Big Model Inference creates a *model skeleton* on PyTorch's [**meta**](https://pytorch.org/docs/main/meta.html) device. The randomly initialized parameters are only created when the pretrained weights are loaded. This way, you aren't keeping two copies of the model in memory at the same time (one for the randomly initialized model and one for the pretrained weights), and the maximum memory consumed is only the full model size.
-
-To enable Big Model Inference in Transformers, set `low_cpu_mem_usage=True` in the [`~PreTrainedModel.from_pretrained`] method.
-
-```py
-from transformers import AutoModelForCausalLM
-
-gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", low_cpu_mem_usage=True)
-```
-
-Accelerate automatically dispatches the model weights across all available devices, starting with the fastest device (GPU) first and then offloading to the slower devices (CPU and even hard drive). This is enabled by setting `device_map="auto"` in the [`~PreTrainedModel.from_pretrained`] method. When you pass the `device_map` parameter, `low_cpu_mem_usage` is automatically set to `True` so you don't need to specify it.
-
-```py
-from transformers import AutoModelForCausalLM
-
-# these loading methods are equivalent
-gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto")
-gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto", low_cpu_mem_usage=True)
-```
-
-You can also write your own `device_map` by mapping each layer to a device. It should map all model parameters to a device, but you don't have to detail where all the submodules of a layer go if the entire layer is on the same device.
-
-```python
-device_map = {"model.layers.1": 0, "model.layers.14": 1, "model.layers.31": "cpu", "lm_head": "disk"}
-```
-
-Access `hf_device_map` attribute to see how Accelerate split the model across devices.
-
-```py
-gemma.hf_device_map
-```
-
-```python out
-{'model.embed_tokens': 0,
- 'model.layers.0': 0,
- 'model.layers.1': 0,
- 'model.layers.2': 0,
- 'model.layers.3': 0,
- 'model.layers.4': 0,
- 'model.layers.5': 0,
- 'model.layers.6': 0,
- 'model.layers.7': 0,
- 'model.layers.8': 0,
- 'model.layers.9': 0,
- 'model.layers.10': 0,
- 'model.layers.11': 0,
- 'model.layers.12': 0,
- 'model.layers.13': 0,
- 'model.layers.14': 'cpu',
- 'model.layers.15': 'cpu',
- 'model.layers.16': 'cpu',
- 'model.layers.17': 'cpu',
- 'model.layers.18': 'cpu',
- 'model.layers.19': 'cpu',
- 'model.layers.20': 'cpu',
- 'model.layers.21': 'cpu',
- 'model.layers.22': 'cpu',
- 'model.layers.23': 'cpu',
- 'model.layers.24': 'cpu',
- 'model.layers.25': 'cpu',
- 'model.layers.26': 'cpu',
- 'model.layers.27': 'cpu',
- 'model.layers.28': 'cpu',
- 'model.layers.29': 'cpu',
- 'model.layers.30': 'cpu',
- 'model.layers.31': 'cpu',
- 'model.norm': 'cpu',
- 'lm_head': 'cpu'}
-```
-
-## Model data type
-
-PyTorch model weights are normally instantiated as torch.float32 and it can be an issue if you try to load a model as a different data type. For example, you'd need twice as much memory to load the weights in torch.float32 and then again to load them in your desired data type, like torch.float16.
-
-> [!WARNING]
-> Due to how PyTorch is designed, the `torch_dtype` parameter only supports floating data types.
-
-To avoid wasting memory like this, explicitly set the `torch_dtype` parameter to the desired data type or set `torch_dtype="auto"` to load the weights with the most optimal memory pattern (the data type is automatically derived from the model weights).
-
-<hfoptions id="dtype">
-<hfoption id="specific dtype">
-
-```py
-from transformers import AutoModelForCausalLM
-
-gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype=torch.float16)
-```
-
-</hfoption>
-<hfoption id="auto dtype">
-
-```py
-from transformers import AutoModelForCausalLM
-
-gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype="auto")
-```
-
-</hfoption>
-</hfoptions>
-
-You can also set the data type to use for models instantiated from scratch.
-
-```python
-import torch
-from transformers import AutoConfig, AutoModel
-
-my_config = AutoConfig.from_pretrained("google/gemma-2b", torch_dtype=torch.float16)
-model = AutoModel.from_config(my_config)
-```
--- a/docs/source/en/cache_explanation.md
+++ b/docs/source/en/cache_explanation.md
@ -0,0 +1,96 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Caching
+
+Imagine you’re having a conversation with someone, and instead of remembering what they previously said, they have to start from scratch every time you respond. This would be slow and inefficient, right?
+
+You can extend this analogy to transformer models. Autoregressive model generation can be slow because it makes a prediction one token at a time. Each new prediction is dependent on all the previous context.
+
+To predict the 1000th token, the model requires information from the previous 999 tokens. The information is represented as matrix multiplications across the token representations.
+
+To predict the 1001th token, you need the same information from the previous 999 tokens in addition to any information from the 1000th token. This is a lot of matrix multiplications a model has to compute over and over for each token!
+
+A key-value (KV) cache eliminates this inefficiency by storing kv pairs derived from the attention layers of previously processed tokens. The stored kv pairs are retrieved from the cache and reused for subsequent tokens, avoiding the need to recompute.
+
+> [!WARNING]
+> Caching should only be used for **inference**. It may cause unexpected errors if it's enabled during training.
+
+## Cache class
+
+When you use Transformers' [`Cache`] class, the self-attention module performs several critical steps to integrate past and present information.
+
+1. The attention module concatenates current kv pairs with past kv pairs stored in the cache. This creates attentions weights with the shape `(new_tokens_length, past_kv_length + new_tokens_length)`. The current and past kv pairs are essentially combined to compute the attention scores, ensuring a model is aware of previous context and the current input.
+
+2. When the `forward` method is called iteratively, it's crucial that the attention mask shape matches the combined length of the past and current kv pairs. The attention mask should have the shape `(batch_size, past_kv_length + new_tokens_length)`. This is typically handled internally in [`~GenerationMixin.generate`], but if you want to implement your own generation loop with [`Cache`], keep this in mind! The attention mask should hold the past and current token values.
+
+3. It is also important to be aware of the `cache_position`. This is important if you want to reuse a prefilled [`Cache`] with the `forward` method because you have to pass a valid `cache_position` value. This indicates the input positions in a sequence. `cache_position` is unaffected by padding, and it always adds one more position for each token. For example, if a kv cache contains 10 tokens - regardless of pad tokens - the cache position for the next token should be `torch.tensor([10])`.
+
+The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.
+
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
+
+model_id = "meta-llama/Llama-2-7b-chat-hf"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+past_key_values = DynamicCache()
+messages = [{"role": "user", "content": "Hello, what's your name."}]
+inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
+
+generated_ids = inputs.input_ids
+cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device="cuda:0")
+max_new_tokens = 10
+
+for _ in range(max_new_tokens):
+    outputs = model(**inputs, cache_position=cache_position, past_key_values=past_key_values, use_cache=True)
+    # Greedily sample one next token
+    next_token_ids = outputs.logits[:, -1:].argmax(-1)
+    generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)
+    # Prepare inputs for the next generation step by leaving unprocessed tokens, in our case we have only one new token
+    # and expanding attn mask for the new token, as explained above
+    attention_mask = inputs["attention_mask"]
+    attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
+    inputs = {"input_ids": next_token_ids, "attention_mask": attention_mask}
+    cache_position = cache_position[-1:] + 1 # add one more position for the next token
+
+print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
+"[INST] Hello, what's your name. [/INST]  Hello! My name is LLaMA,"
+```
+
+## Legacy cache format
+
+Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format has is dynamic because it grows as text is generated, similar to [`DynamicCache`].
+
+If your project depends on this legacy format, you can convert between [`DynamicCache`] and a tuple of tuples as shown below with the [`~DynamicCache.from_legacy_cache`] and [`DynamicCache.to_legacy_cache`] functions. This is helpful if you have custom logic for manipulating a cache in a specific format.
+
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
+
+# `return_dict_in_generate=True` is required to return the cache and `return_legacy_cache` forces the returned cache
+# in the legacy format
+generation_outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5)
+
+cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
+legacy_format_cache = cache.to_legacy_cache()
+```
--- a/docs/source/en/chat_extras.md
+++ b/docs/source/en/chat_extras.md
@ -0,0 +1,299 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Tools and RAG
+
+The [`~PreTrainedTokenizerBase.apply_chat_template`] method supports virtually any additional argument types - strings, lists, dicts - besides the chat message. This makes it possible to use chat templates for many use cases.
+
+This guide will demonstrate how to use chat templates with tools and retrieval-augmented generation (RAG).
+
+## Tools
+
+Tools are functions a large language model (LLM) can call to perform specific tasks. It is a powerful way to extend the capabilities of conversational agents with real-time information, computational tools, or access to large databases.
+
+Follow the rules below when creating a tool.
+
+1. The function should have a descriptive name.
+2. The function arguments must have a type hint in the function header (don't include in the `Args` block).
+3. The function must have a [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) docstring.
+4. The function can have a return type and `Returns` block, but these are optional because most tool use models ignore them.
+
+An example tool to get temperature and wind speed is shown below.
+
+```py
+def get_current_temperature(location: str, unit: str) -> float:
+    """
+    Get the current temperature at a location.
+    
+    Args:
+        location: The location to get the temperature for, in the format "City, Country"
+        unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"])
+    Returns:
+        The current temperature at the specified location in the specified units, as a float.
+    """
+    return 22.  # A real function should probably actually get the temperature!
+
+def get_current_wind_speed(location: str) -> float:
+    """
+    Get the current wind speed in km/h at a given location.
+    
+    Args:
+        location: The location to get the temperature for, in the format "City, Country"
+    Returns:
+        The current wind speed at the given location in km/h, as a float.
+    """
+    return 6.  # A real function should probably actually get the wind speed!
+
+tools = [get_current_temperature, get_current_wind_speed]
+```
+
+Load a model and tokenizer that supports tool-use like [NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B), but you can also consider a larger model like [Command-R](./model_doc/cohere) and [Mixtral-8x22B](./model_doc/mixtral) if your hardware can support it.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B")
+tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B")
+model = AutoModelForCausalLM.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B", torch_dtype=torch.bfloat16, device_map="auto")
+```
+
+Create a chat message.
+
+```py
+messages = [
+  {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."},
+  {"role": "user", "content": "Hey, what's the temperature in Paris right now?"}
+]
+```
+
+Pass `messages` and a list of tools to [`~PreTrainedTokenizerBase.apply_chat_template`]. Then you can pass the inputs to the model for generation.
+
+```py
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v for k, v in inputs.items()}
+outputs = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
+```
+
+```txt
+<tool_call>
+{"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"}
+</tool_call><|im_end|>
+```
+
+The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature. 
+
+Now append the `get_current_temperature` function and these arguments to the chat message as `tool_call`. The `tool_call` dictionary should be provided to the `assistant` role instead of the `system` or `user`.
+
+> [!WARNING]
+> The OpenAI API uses a JSON string as its `tool_call` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.
+
+<hfoptions id="tool-call">
+<hfoption id="Llama">
+
+```py
+tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
+messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
+```
+
+Allow the assistant to read the function outputs and chat with the user.
+
+```py
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v for k, v in inputs.items()}
+out = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
+```
+
+```txt
+The temperature in Paris, France right now is approximately 12°C (53.6°F).<|im_end|>
+```
+
+</hfoption>
+<hfoption id="Mistral/Mixtral">
+
+For [Mistral](./model_doc/mistral) and [Mixtral](./model_doc/mixtral) models, you need an additional `tool_call_id`. The `tool_call_id` is 9 randomly generated alphanumeric characters assigned to the `id` key in the `tool_call` dictionary.
+
+```py
+tool_call_id = "9Ae3bDc2F"
+tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
+messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]})
+```
+
+```py
+inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
+inputs = {k: v for k, v in inputs.items()}
+out = model.generate(**inputs, max_new_tokens=128)
+print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
+```
+
+</hfoption>
+</hfoptions>
+
+## Schema
+
+[`~PreTrainedTokenizerBase.apply_chat_template`] converts functions into a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step) which is passed to the chat template. A LLM never sees the code inside the function. In other words, a LLM doesn't care how the function works technically, it only cares about function **definition** and **arguments**.
+
+The JSON schema is automatically generated behind the scenes as long as your function follows the [rules](#tools) listed earlier above. But you can use [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205) to manually convert a schema for more visibility or debugging.
+
+```py
+from transformers.utils import get_json_schema
+
+def multiply(a: float, b: float):
+    """
+    A function that multiplies two numbers
+    
+    Args:
+        a: The first number to multiply
+        b: The second number to multiply
+    """
+    return a * b
+
+schema = get_json_schema(multiply)
+print(schema)
+```
+
+```json
+{
+  "type": "function", 
+  "function": {
+    "name": "multiply", 
+    "description": "A function that multiplies two numbers", 
+    "parameters": {
+      "type": "object", 
+      "properties": {
+        "a": {
+          "type": "number", 
+          "description": "The first number to multiply"
+        }, 
+        "b": {
+          "type": "number",
+          "description": "The second number to multiply"
+        }
+      }, 
+      "required": ["a", "b"]
+    }
+  }
+}
+```
+
+You can edit the schema or write one entirely from scratch. This gives you a lot of flexibility to define precise schemas for more complex functions.
+
+> [!WARNING]
+> Try keeping your function signatures simple and the arguments to a minimum. These are easier for a model to understand and use than complex functions for example with nested arguments.
+
+The example below demonstrates writing a schema manually and then passing it to [`~PreTrainedTokenizerBase.apply_chat_template`].
+
+```py
+# A simple function that takes no arguments
+current_time = {
+  "type": "function", 
+  "function": {
+    "name": "current_time",
+    "description": "Get the current local time as a string.",
+    "parameters": {
+      'type': 'object',
+      'properties': {}
+    }
+  }
+}
+
+# A more complete function that takes two numerical arguments
+multiply = {
+  'type': 'function',
+  'function': {
+    'name': 'multiply',
+    'description': 'A function that multiplies two numbers', 
+    'parameters': {
+      'type': 'object', 
+      'properties': {
+        'a': {
+          'type': 'number',
+          'description': 'The first number to multiply'
+        }, 
+        'b': {
+          'type': 'number', 'description': 'The second number to multiply'
+        }
+      }, 
+      'required': ['a', 'b']
+    }
+  }
+}
+
+model_input = tokenizer.apply_chat_template(
+    messages,
+    tools = [current_time, multiply]
+)
+```
+
+## RAG
+
+Retrieval-augmented generation (RAG) models enhance a models existing knowledge by allowing it to search documents for additional information before returning a query. For RAG models, add a `documents` parameter to [`~PreTrainedTokenizerBase.apply_chat_template`]. This `documents` parameter should be a list of documents, and each document should be a single dict with `title` and `content` keys.
+
+> [!TIP]
+> The `documents` parameter for RAG isn't widely supported and many models have chat templates that ignore `documents`. Verify if a model supports `documents` by reading its model card or executing `print(tokenizer.chat_template)` to see if the `documents` key is present. [Command-R](https://hf.co/CohereForAI/c4ai-command-r-08-2024) and [Command-R+](https://hf.co/CohereForAI/c4ai-command-r-plus-08-2024) both support `documents` in their RAG chat templates.
+
+Create a list of documents to pass to the model.
+
+```py
+documents = [
+    {
+        "title": "The Moon: Our Age-Old Foe", 
+        "text": "Man has always dreamed of destroying the moon. In this essay, I shall..."
+    },
+    {
+        "title": "The Sun: Our Age-Old Friend",
+        "text": "Although often underappreciated, the sun provides several notable benefits..."
+    }
+]
+```
+
+Set `chat_template="rag"` in [`~PreTrainedTokenizerBase.apply_chat_template`] and generate a response.
+
+```py
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# Load the model and tokenizer
+tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit")
+model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit", device_map="auto")
+device = model.device # Get the device the model is loaded on
+
+# Define conversation input
+conversation = [
+    {"role": "user", "content": "What has Man always dreamed of?"}
+]
+
+input_ids = tokenizer.apply_chat_template(
+    conversation=conversation,
+    documents=documents,
+    chat_template="rag",
+    tokenize=True,
+    add_generation_prompt=True,
+    return_tensors="pt").to(device)
+
+# Generate a response 
+generated_tokens = model.generate(
+    input_ids,
+    max_new_tokens=100,
+    do_sample=True,
+    temperature=0.3,
+    )
+
+# Decode and print the generated text along with generation prompt
+generated_text = tokenizer.decode(generated_tokens[0])
+print(generated_text)
+```
--- a/docs/source/en/chat_template_advanced.md
+++ b/docs/source/en/chat_template_advanced.md
@ -1,463 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Advanced Usage and Customizing Your Chat Templates
-
-In this page, we’ll explore more advanced techniques for working with chat templates in Transformers. Whether you’re looking to write your own templates, create custom components, or optimize your templates for efficiency, we’ll cover everything you need to take your templates to the next level. Let’s dive into the tools and strategies that will help you get the most out of your chat models.
-
-
-## How do chat templates work?
-
-The chat template for a model is stored on the `tokenizer.chat_template` attribute. Let's take a look at a `Zephyr` chat template, though note this
-one is a little simplified from the actual one!
-
-```
-{%- for message in messages %}
-    {{- '<|' + message['role'] + '|>\n' }}
-    {{- message['content'] + eos_token }}
-{%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|assistant|>\n' }}
-{%- endif %}
-```
-
-If you've never seen one of these before, this is a [Jinja template](https://jinja.palletsprojects.com/en/3.1.x/templates/).
-Jinja is a templating language that allows you to write simple code that generates text. In many ways, the code and
-syntax resembles Python. In pure Python, this template would look something like this:
-
-```python
-for message in messages:
-    print(f'<|{message["role"]}|>')
-    print(message['content'] + eos_token)
-if add_generation_prompt:
-    print('<|assistant|>')
-```
-
-Effectively, the template does three things:
-1. For each message, print the role enclosed in `<|` and `|>`, like `<|user|>` or `<|assistant|>`.
-2. Next, print the content of the message, followed by the end-of-sequence token.
-3. Finally, if `add_generation_prompt` is set, print the assistant token, so that the model knows to start generating
-   an assistant response.
-
-This is a pretty simple template but Jinja gives you a lot of flexibility to do more complex things! Let's see a Jinja
-template that can format inputs similarly to the way LLaMA formats them (note that the real LLaMA template includes 
-handling for default system messages and slightly different system message handling in general - don't use this one 
-in your actual code!)
-
-```
-{%- for message in messages %}
-    {%- if message['role'] == 'user' %}
-        {{- bos_token + '[INST] ' + message['content'] + ' [/INST]' }}
-    {%- elif message['role'] == 'system' %}
-        {{- '<<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }}
-    {%- elif message['role'] == 'assistant' %}
-        {{- ' '  + message['content'] + ' ' + eos_token }}
-    {%- endif %}
-{%- endfor %}
-```
-
-Hopefully if you stare at this for a little bit you can see what this template is doing - it adds specific tokens like
-`[INST]` and `[/INST]` based on the role of each message. User, assistant and system messages are clearly
-distinguishable to the model because of the tokens they're wrapped in.
-
-
-## How do I create a chat template?
-
-Simple, just write a jinja template and set `tokenizer.chat_template`. You may find it easier to start with an 
-existing template from another model and simply edit it for your needs! For example, we could take the LLaMA template
-above and add "[ASST]" and "[/ASST]" to assistant messages:
-
-```
-{%- for message in messages %}
-    {%- if message['role'] == 'user' %}
-        {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }}
-    {%- elif message['role'] == 'system' %}
-        {{- '<<SYS>>\\n' + message['content'].strip() + '\\n<</SYS>>\\n\\n' }}
-    {%- elif message['role'] == 'assistant' %}
-        {{- '[ASST] '  + message['content'] + ' [/ASST]' + eos_token }}
-    {%- endif %}
-{%- endfor %}
-```
-
-Now, simply set the `tokenizer.chat_template` attribute. Next time you use [`~PreTrainedTokenizer.apply_chat_template`], it will
-use your new template! This attribute will be saved in the `tokenizer_config.json` file, so you can use
-[`~utils.PushToHubMixin.push_to_hub`] to upload your new template to the Hub and make sure everyone's using the right
-template for your model!
-
-```python
-template = tokenizer.chat_template
-template = template.replace("SYS", "SYSTEM")  # Change the system token
-tokenizer.chat_template = template  # Set the new template
-tokenizer.push_to_hub("model_name")  # Upload your new template to the Hub!
-```
-
-The method [`~PreTrainedTokenizer.apply_chat_template`] which uses your chat template is called by the [`TextGenerationPipeline`] class, so 
-once you set the correct chat template, your model will automatically become compatible with [`TextGenerationPipeline`].
-
-<Tip>
-If you're fine-tuning a model for chat, in addition to setting a chat template, you should probably add any new chat
-control tokens as special tokens in the tokenizer. Special tokens are never split, 
-ensuring that your control tokens are always handled as single tokens rather than being tokenized in pieces. You 
-should also set the tokenizer's `eos_token` attribute to the token that marks the end of assistant generations in your
-template. This will ensure that text generation tools can correctly figure out when to stop generating text.
-</Tip>
-
-
-## Why do some models have multiple templates?
-
-Some models use different templates for different use cases. For example, they might use one template for normal chat
-and another for tool-use, or retrieval-augmented generation. In these cases, `tokenizer.chat_template` is a dictionary.
-This can cause some confusion, and where possible, we recommend using a single template for all use-cases. You can use
-Jinja statements like `if tools is defined` and `{% macro %}` definitions to easily wrap multiple code paths in a
-single template.
-
-When a tokenizer has multiple templates, `tokenizer.chat_template` will be a `dict`, where each key is the name
-of a template. The `apply_chat_template` method has special handling for certain template names: Specifically, it will
-look for a template named `default` in most cases, and will raise an error if it can't find one. However, if a template
-named `tool_use` exists when the user has passed a `tools` argument, it will use that instead. To access templates
-with other names, pass the name of the template you want to the `chat_template` argument of
-`apply_chat_template()`.
-
-We find that this can be a bit confusing for users, though - so if you're writing a template yourself, we recommend
-trying to put it all in a single template where possible!
-
-
-## What template should I use?
-
-When setting the template for a model that's already been trained for chat, you should ensure that the template
-exactly matches the message formatting that the model saw during training, or else you will probably experience
-performance degradation. This is true even if you're training the model further - you will probably get the best 
-performance if you keep the chat tokens constant. This is very analogous to tokenization - you generally get the
-best performance for inference or fine-tuning when you precisely match the tokenization used during training.
-
-If you're training a model from scratch, or fine-tuning a base language model for chat, on the other hand,
-you have a lot of freedom to choose an appropriate template! LLMs are smart enough to learn to handle lots of different
-input formats. One popular choice is the `ChatML` format, and this is a good, flexible choice for many use-cases. 
-It looks like this:
-
-```
-{%- for message in messages %}
-    {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
-{%- endfor %}
-```
-
-If you like this one, here it is in one-liner form, ready to copy into your code. The one-liner also includes
-handy support for [generation prompts](#what-are-generation-prompts), but note that it doesn't add BOS or EOS tokens!
-If your model expects those, they won't be added automatically by `apply_chat_template` - in other words, the
-text will be tokenized with `add_special_tokens=False`. This is to avoid potential conflicts between the template and
-the `add_special_tokens` logic. If your model expects special tokens, make sure to add them to the template!
-
-```python
-tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
-```
-
-This template wraps each message in `<|im_start|>` and `<|im_end|>` tokens, and simply writes the role as a string, which
-allows for flexibility in the roles you train with. The output looks like this:
-
-```text
-<|im_start|>system
-You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|>
-<|im_start|>user
-How are you?<|im_end|>
-<|im_start|>assistant
-I'm doing great!<|im_end|>
-```
-
-The "user", "system" and "assistant" roles are the standard for chat, and we recommend using them when it makes sense,
-particularly if you want your model to operate well with [`TextGenerationPipeline`]. However, you are not limited
-to these roles - templating is extremely flexible, and any string can be a role.
-
-## I want to add some chat templates! How should I get started?
-
-If you have any chat models, you should set their `tokenizer.chat_template` attribute and test it using
-[`~PreTrainedTokenizer.apply_chat_template`], then push the updated tokenizer to the Hub. This applies even if you're
-not the model owner - if you're using a model with an empty chat template, or one that's still using the default class
-template, please open a [pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) to the model repository so that this attribute can be set properly!
-
-Once the attribute is set, that's it, you're done! `tokenizer.apply_chat_template` will now work correctly for that
-model, which means it is also automatically supported in places like `TextGenerationPipeline`!
-
-By ensuring that models have this attribute, we can make sure that the whole community gets to use the full power of
-open-source models. Formatting mismatches have been haunting the field and silently harming performance for too long - 
-it's time to put an end to them!
-
-
-<Tip>
-
-The easiest way to get started with writing Jinja templates is to take a look at some existing ones. You can use
-`print(tokenizer.chat_template)` for any chat model to see what template it's using. In general, models that support tool use have 
-much more complex templates than other models - so when you're just getting started, they're probably a bad example
-to learn from! You can also take a look at the 
-[Jinja documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for details
-of general Jinja formatting and syntax.
-
-</Tip>
-
-Jinja templates in `transformers` are identical to Jinja templates elsewhere. The main thing to know is that 
-the conversation history will be accessible inside your template as a variable called `messages`.  
-You will be able to access `messages` in your template just like you can in Python, which means you can loop over 
-it with `{% for message in messages %}` or access individual messages with `{{ messages[0] }}`, for example.
-
-You can also use the following tips to write clean, efficient Jinja templates:
-
-### Trimming whitespace
-
-By default, Jinja will print any whitespace that comes before or after a block. This can be a problem for chat
-templates, which generally want to be very precise with whitespace! To avoid this, we strongly recommend writing
-your templates like this:
-
-```
-{%- for message in messages %}
-    {{- message['role'] + message['content'] }}
-{%- endfor %}
-```
-
-rather than like this:
-
-```
-{% for message in messages %}
-    {{ message['role'] + message['content'] }}
-{% endfor %}
-```
-
-Adding `-` will strip any whitespace that comes before the block. The second example looks innocent, but the newline
-and indentation may end up being included in the output, which is probably not what you want!
-
-### Special variables
-
-Inside your template, you will have access several special variables. The most important of these is `messages`, 
-which contains the chat history as a list of message dicts. However, there are several others. Not every
-variable will be used in every template. The most common other variables are:
-
- `tools` contains a list of tools in JSON schema format. Will be `None` or undefined if no tools are passed.
- `documents` contains a list of documents in the format `{"title": "Title", "contents": "Contents"}`, used for retrieval-augmented generation. Will be `None` or undefined if no documents are passed.
- `add_generation_prompt` is a bool that is `True` if the user has requested a generation prompt, and `False` otherwise. If this is set, your template should add the header for an assistant message to the end of the conversation. If your model doesn't have a specific header for assistant messages, you can ignore this flag.
- **Special tokens** like `bos_token` and `eos_token`. These are extracted from `tokenizer.special_tokens_map`. The exact tokens available inside each template will differ depending on the parent tokenizer.
-
-<Tip>
-
-You can actually pass any `kwarg` to `apply_chat_template`, and it will be accessible inside the template as a variable. In general,
-we recommend trying to stick to the core variables above, as it will make your model harder to use if users have
-to write custom code to pass model-specific `kwargs`. However, we're aware that this field moves quickly, so if you
-have a new use-case that doesn't fit in the core API, feel free to use a new `kwarg` for it! If a new `kwarg`
-becomes common we may promote it into the core API and create a standard, documented format for it.
-
-</Tip>
-
-### Callable functions
-
-There is also a short list of callable functions available to you inside your templates. These are:
-
- `raise_exception(msg)`: Raises a `TemplateException`. This is useful for debugging, and for telling users when they're
-doing something that your template doesn't support.
- `strftime_now(format_str)`: Equivalent to `datetime.now().strftime(format_str)` in Python. This is used for getting
-the current date/time in a specific format, which is sometimes included in system messages.
-
-### Compatibility with non-Python Jinja
-
-There are multiple implementations of Jinja in various languages. They generally have the same syntax,
-but a key difference is that when you're writing a template in Python you can use Python methods, such as
-`.lower()` on strings or `.items()` on dicts. This will break if someone tries to use your template on a non-Python
-implementation of Jinja. Non-Python implementations are particularly common in deployment environments, where JS
-and Rust are very popular. 
-
-Don't panic, though! There are a few easy changes you can make to your templates to ensure they're compatible across
-all implementations of Jinja:
-
- Replace Python methods with Jinja filters. These usually have the same name, for example `string.lower()` becomes
-  `string|lower`, and `dict.items()` becomes `dict|items`. One notable change is that `string.strip()` becomes `string|trim`.
-  See the [list of built-in filters](https://jinja.palletsprojects.com/en/3.1.x/templates/#builtin-filters)
-  in the Jinja documentation for more.
- Replace `True`, `False` and `None`, which are Python-specific, with `true`, `false` and `none`.
- Directly rendering a dict or list may give different results in other implementations (for example, string entries
-  might change from single-quoted to double-quoted). Adding the `tojson` filter can help to ensure consistency here.
-
-### Writing generation prompts
-
-We mentioned above that `add_generation_prompt` is a special variable that will be accessible inside your template,
-and is controlled by the user setting the `add_generation_prompt` flag. If your model expects a header for
-assistant messages, then your template must support adding the header when `add_generation_prompt` is set.
-
-Here is an example of a template that formats messages ChatML-style, with generation prompt support:
-
-```text
-{{- bos_token }}
-{%- for message in messages %}
-    {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
-{%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|im_start|>assistant\n' }}
-{%- endif %}
-```
-
-The exact content of the assistant header will depend on your specific model, but it should always be **the string
-that represents the start of an assistant message**, so that if the user applies your template with 
-`add_generation_prompt=True` and then generates text, the model will write an assistant response. Also note that some
-models do not need a generation prompt, because assistant messages always begin immediately after user messages. 
-This is particularly common for LLaMA and Mistral models, where assistant messages begin immediately after the `[/INST]`
-token that ends user messages. In these cases, the template can ignore the `add_generation_prompt` flag.
-
-Generation prompts are important! If your model requires a generation prompt but it is not set in the template, then
-model generations will likely be severely degraded, or the model may display unusual behaviour like continuing 
-the final user message! 
-
-### Writing and debugging larger templates
-
-When this feature was introduced, most templates were quite small, the Jinja equivalent of a "one-liner" script. 
-However, with new models and features like tool-use and RAG, some templates can be 100 lines long or more. When
-writing templates like these, it's a good idea to write them in a separate file, using a text editor. You can easily 
-extract a chat template to a file:
-
-```python
-open("template.jinja", "w").write(tokenizer.chat_template)
-```
-
-Or load the edited template back into the tokenizer:
-
-```python
-tokenizer.chat_template = open("template.jinja").read()
-```
-
-As an added bonus, when you write a long, multi-line template in a separate file, line numbers in that file will
-exactly correspond to line numbers in template parsing or execution errors. This will make it much easier to
-identify the source of issues.
-
-
-
-## Writing templates for tools
-
-Although chat templates do not enforce a specific API for tools (or for anything, really), we recommend 
-template authors try to stick to a standard API where possible. The whole point of chat templates is to allow code
-to be transferable across models, so deviating from the standard tools API means users will have to write
-custom code to use tools with your model. Sometimes it's unavoidable, but often with clever templating you can
-make the standard API work!
-
-Below, we'll list the elements of the standard API, and give tips on writing templates that will work well with it.
-
-### Tool definitions
-
-Your template should expect that the variable `tools` will either be null (if no tools are passed), or is a list 
-of JSON schema dicts. Our chat template methods allow users to pass tools as either JSON schema or Python functions, but when
-functions are passed, we automatically generate JSON schema and pass that to your template. As a result, the 
-`tools` variable that your template receives will always be a list of JSON schema. Here is
-a sample tool JSON schema:
-
-```json
-{
-  "type": "function", 
-  "function": {
-    "name": "multiply", 
-    "description": "A function that multiplies two numbers", 
-    "parameters": {
-      "type": "object", 
-      "properties": {
-        "a": {
-          "type": "number", 
-          "description": "The first number to multiply"
-        }, 
-        "b": {
-          "type": "number",
-          "description": "The second number to multiply"
-        }
-      }, 
-      "required": ["a", "b"]
-    }
-  }
-}
-```
-
-And here is some example code for handling tools in your chat template. Remember, this is just an example for a
-specific format - your model will probably need different formatting!
-
-```text
-{%- if tools %}
-    {%- for tool in tools %}
-        {{- '<tool>' + tool['function']['name'] + '\n' }}
-        {%- for argument in tool['function']['parameters']['properties'] %}
-            {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }}
-        {%- endfor %}
-        {{- '\n</tool>' }}
-    {%- endif %}
-{%- endif %}
-```
-
-The specific tokens and tool descriptions your template renders should of course be chosen to match the ones your model
-was trained with. There is no requirement that your **model** understands JSON schema input, only that your template can translate
-JSON schema into your model's format. For example, [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) 
-was trained with tools defined using Python function headers, but the Command-R tool template accepts JSON schema, 
-converts types internally and renders the input tools as Python headers. You can do a lot with templates!
-
-### Tool calls
-
-Tool calls, if present, will be a list attached to a message with the "assistant" role. Note that `tool_calls` is 
-always a list, even though most tool-calling models only support single tool calls at a time, which means
-the list will usually only have a single element. Here is a sample message dict containing a tool call:
-
-```json
-{
-  "role": "assistant",
-  "tool_calls": [
-    {
-      "type": "function",
-      "function": {
-        "name": "multiply",
-        "arguments": {
-          "a": 5,
-          "b": 6
-        }
-      }
-    }
-  ]
-}
-```
-
-And a common pattern for handling them would be something like this:
-
-```text
-{%- if message['role'] == 'assistant' and 'tool_calls' in message %}
-    {%- for tool_call in message['tool_calls'] %}
-            {{- '<tool_call>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n</tool_call>' }}
-        {%- endif %}
-    {%- endfor %}
-{%- endif %}
-```
-
-Again, you should render the tool call with the formatting and special tokens that your model expects.
-
-### Tool responses
-
-Tool responses have a simple format: They are a message dict with the "tool" role, a "name" key giving the name
-of the called function, and a "content" key containing the result of the tool call. Here is a sample tool response:
-
-```json
-{
-  "role": "tool",
-  "name": "multiply",
-  "content": "30"
-}
-```
-
-You don't need to use all of the keys in the tool response. For example, if your model doesn't expect the function
-name to be included in the tool response, then rendering it can be as simple as:
-
-```text
-{%- if message['role'] == 'tool' %}
-    {{- "<tool_result>" + message['content'] + "</tool_result>" }}
-{%- endif %}
-```
-
-Again, remember that the actual formatting and special tokens are model-specific - you should take a lot of care
-to ensure that tokens, whitespace and everything else exactly match the format your model was trained with!
--- a/docs/source/en/chat_template_basics.md
+++ b/docs/source/en/chat_template_basics.md
@ -1,287 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Getting Started with Chat Templates for Text LLMs
-
-An increasingly common use case for LLMs is **chat**. In a chat context, rather than continuing a single string
-of text (as is the case with a standard language model), the model instead continues a conversation that consists
-of one or more **messages**, each of which includes a **role**, like "user" or "assistant", as well as message text.
-
-Much like tokenization, different models expect very different input formats for chat. This is the reason we added
-**chat templates** as a feature. Chat templates are part of the tokenizer for text-only LLMs or processor for multimodal LLMs. They specify how to convert conversations, represented as lists of messages, into a single tokenizable string in the format that the model expects. 
-
-We'll explore the basic usage of chat templates with text-only LLMs in this page. For detailed guidance on multimodal models, we have a dedicated [documentation oage for multimodal models](./chat_template_multimodal), which covers how to work with image, video and audio inputs in your templates.
-
-Let's make this concrete with a quick example using the `mistralai/Mistral-7B-Instruct-v0.1` model:
-
-```python
->>> from transformers import AutoTokenizer
->>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
-
->>> chat = [
-...   {"role": "user", "content": "Hello, how are you?"},
-...   {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
-...   {"role": "user", "content": "I'd like to show off how chat templating works!"},
-... ]
-
->>> tokenizer.apply_chat_template(chat, tokenize=False)
-"<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"
-```
-
-Notice how the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of 
-user messages (but not assistant messages!), and the entire chat is condensed into a single string. 
-If we use `tokenize=True`, which is the default setting, that string will also be tokenized for us.
-
-Now, try the same code, but swap in the `HuggingFaceH4/zephyr-7b-beta` model instead, and you should get:
-
-```text
-<|user|>
-Hello, how are you?</s>
-<|assistant|>
-I'm doing great. How can I help you today?</s>
-<|user|>
-I'd like to show off how chat templating works!</s>
-```
-
-Both Zephyr and Mistral-Instruct were fine-tuned from the same base model, `Mistral-7B-v0.1`. However, they were trained
-with totally different chat formats. Without chat templates, you would have to write manual formatting code for each
-model, and it's very easy to make minor errors that hurt performance! Chat templates handle the details of formatting 
-for you, allowing you to write universal code that works for any model.
-
-
-## How do I use chat templates?
-
-As you can see in the example above, chat templates are easy to use. Simply build a list of messages, with `role`
-and `content` keys, and then pass it to the [`~PreTrainedTokenizer.apply_chat_template`] or [`~ProcessorMixin.apply_chat_template`] method
-depending on what type of model you are using. Once you do that,
-you'll get output that's ready to go! When using chat templates as input for model generation, it's also a good idea
-to use `add_generation_prompt=True` to add a [generation prompt](#what-are-generation-prompts). 
-
-Here's an example of preparing input for `model.generate()`, using `Zephyr` again:
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-checkpoint = "HuggingFaceH4/zephyr-7b-beta"
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-model = AutoModelForCausalLM.from_pretrained(checkpoint)  # You may want to use bfloat16 and/or move to GPU here
-
-messages = [
-    {
-        "role": "system",
-        "content": "You are a friendly chatbot who always responds in the style of a pirate",
-    },
-    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
- ]
-tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
-print(tokenizer.decode(tokenized_chat[0]))
-```
-This will yield a string in the input format that Zephyr expects. 
-```text
-<|system|>
-You are a friendly chatbot who always responds in the style of a pirate</s> 
-<|user|>
-How many helicopters can a human eat in one sitting?</s> 
-<|assistant|>
-```
-
-Now that our input is formatted correctly for Zephyr, we can use the model to generate a response to the user's question:
-
-```python
-outputs = model.generate(tokenized_chat, max_new_tokens=128) 
-print(tokenizer.decode(outputs[0]))
-```
-
-This will yield:
-
-```text
-<|system|>
-You are a friendly chatbot who always responds in the style of a pirate</s> 
-<|user|>
-How many helicopters can a human eat in one sitting?</s> 
-<|assistant|>
-Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all.
-```
-
-Arr, 'twas easy after all!
-
-
-## Is there an automated pipeline for chat?
-
-Yes, there is! Our text generation pipelines support chat inputs, which makes it easy to use chat models. In the past,
-we used to use a dedicated "ConversationalPipeline" class, but this has now been deprecated and its functionality
-has been merged into the [`TextGenerationPipeline`]. Let's try the `Zephyr` example again, but this time using 
-a pipeline:
-
-```python
-from transformers import pipeline
-
-pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta")
-messages = [
-    {
-        "role": "system",
-        "content": "You are a friendly chatbot who always responds in the style of a pirate",
-    },
-    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
-]
-print(pipe(messages, max_new_tokens=128)[0]['generated_text'][-1])  # Print the assistant's response
-```
-
-```text
-{'role': 'assistant', 'content': "Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all."}
-```
-
-The pipeline will take care of all the details of tokenization and calling `apply_chat_template` for you -
-once the model has a chat template, all you need to do is initialize the pipeline and pass it the list of messages!
-
-
-## What are "generation prompts"?
-
-You may have noticed that the `apply_chat_template` method has an `add_generation_prompt` argument. This argument tells
-the template to add tokens that indicate the start of a bot response. For example, consider the following chat:
-
-```python
-messages = [
-    {"role": "user", "content": "Hi there!"},
-    {"role": "assistant", "content": "Nice to meet you!"},
-    {"role": "user", "content": "Can I ask a question?"}
-]
-```
-
-Here's what this will look like without a generation prompt, for a model that uses standard "ChatML" formatting:
-
-```python
-tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
-"""<|im_start|>user
-Hi there!<|im_end|>
-<|im_start|>assistant
-Nice to meet you!<|im_end|>
-<|im_start|>user
-Can I ask a question?<|im_end|>
-"""
-```
-
-And here's what it looks like **with** a generation prompt:
-
-```python
-tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-"""<|im_start|>user
-Hi there!<|im_end|>
-<|im_start|>assistant
-Nice to meet you!<|im_end|>
-<|im_start|>user
-Can I ask a question?<|im_end|>
-<|im_start|>assistant
-"""
-```
-
-Note that this time, we've added the tokens that indicate the start of a bot response. This ensures that when the model
-generates text it will write a bot response instead of doing something unexpected, like continuing the user's 
-message. Remember, chat models are still just language models - they're trained to continue text, and chat is just a 
-special kind of text to them! You need to guide them with appropriate control tokens, so they know what they're 
-supposed to be doing.
-
-Not all models require generation prompts. Some models, like LLaMA, don't have any
-special tokens before bot responses. In these cases, the `add_generation_prompt` argument will have no effect. The exact
-effect that `add_generation_prompt` has will depend on the template being used.
-
-
-## What does "continue_final_message" do?
-
-When passing a list of messages to `apply_chat_template` or `TextGenerationPipeline`, you can choose
-to format the chat so the model will continue the final message in the chat instead of starting a new one. This is done
-by removing any end-of-sequence tokens that indicate the end of the final message, so that the model will simply
-extend the final message when it begins to generate text. This is useful for "prefilling" the model's response. 
-
-Here's an example:
-
-```python
-chat = [
-    {"role": "user", "content": "Can you format the answer in JSON?"},
-    {"role": "assistant", "content": '{"name": "'},
-]
-
-formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_final_message=True)
-model.generate(**formatted_chat)
-```
-
-The model will generate text that continues the JSON string, rather than starting a new message. This approach
-can be very useful for improving the accuracy of the model's instruction-following when you know how you want
-it to start its replies.
-
-Because `add_generation_prompt` adds the tokens that start a new message, and `continue_final_message` removes any
-end-of-message tokens from the final message, it does not make sense to use them together. As a result, you'll
-get an error if you try!
-
-<Tip>
-
-The default behaviour of `TextGenerationPipeline` is to set `add_generation_prompt=True` so that it starts a new
-message. However, if the final message in the input chat has the "assistant" role, it will assume that this message is 
-a prefill and switch to `continue_final_message=True` instead, because most models do not support multiple 
-consecutive assistant messages. You can override this behaviour by explicitly passing the `continue_final_message` 
-argument when calling the pipeline.
-
-</Tip>
-
-
-## Can I use chat templates in training?
-
-Yes! This is a good way to ensure that the chat template matches the tokens the model sees during training.
-We recommend that you apply the chat template as a preprocessing step for your dataset. After this, you
-can simply continue like any other language model training task. When training, you should usually set 
-`add_generation_prompt=False`, because the added tokens to prompt an assistant response will not be helpful during 
-training. Let's see an example:
-
-```python
-from transformers import AutoTokenizer
-from datasets import Dataset
-
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
-
-chat1 = [
-    {"role": "user", "content": "Which is bigger, the moon or the sun?"},
-    {"role": "assistant", "content": "The sun."}
-]
-chat2 = [
-    {"role": "user", "content": "Which is bigger, a virus or a bacterium?"},
-    {"role": "assistant", "content": "A bacterium."}
-]
-
-dataset = Dataset.from_dict({"chat": [chat1, chat2]})
-dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)})
-print(dataset['formatted_chat'][0])
-```
-And we get:
-```text
-<|user|>
-Which is bigger, the moon or the sun?</s>
-<|assistant|>
-The sun.</s>
-```
-
-From here, just continue training like you would with a standard language modelling task, using the `formatted_chat` column.
-
-<Tip>
-
-By default, some tokenizers add special tokens like `<bos>` and `<eos>` to text they tokenize. Chat templates should 
-already include all the special tokens they need, and so additional special tokens will often be incorrect or 
-duplicated, which will hurt model performance.
-
-Therefore, if you format text with `apply_chat_template(tokenize=False)`, you should set the argument
-`add_special_tokens=False` when you tokenize that text later. If you use `apply_chat_template(tokenize=True)`, you don't need to worry about this!
-
-</Tip>
-
--- a/docs/source/en/chat_template_multimodal.md
+++ b/docs/source/en/chat_template_multimodal.md
@ -1,289 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Multimodal Chat Templates for Vision and Audio LLMs
-
-In this section, we'll explore how to use chat templates with multimodal models, enabling your templates to handle a variety of inputs such as text, images, and audio. Multimodal models provide richer, more interactive experiences, and understanding how to effectively combine these inputs within your templates is key. We’ll walk through how to work with different modalities, configure your templates for optimal performance, and tackle common challenges along the way.
-
-Just like with text-only LLMs, multimodal models expect a chat with **messages**, each of which includes a **role** and **content**. However, for multimodal models, chat templates are a part of the [Processor](./main_cllasses/processors) class. Let's see how we can format our prompts when there are images or videos in the input along with text.
-
-
-## Image inputs
-
-For models such as [LLaVA](https://huggingface.co/llava-hf) the prompts can be formatted as below. Notice that the only difference from text-only models is that we need to also pass a placeholder for input images. To accommodate for extra modalities, each **content** is a list containing either a text or an image **type**.
-
-Let's make this concrete with a quick example using the `llava-hf/llava-onevision-qwen2-0.5b-ov-hf` model:
-
-```python
-from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
-
-model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
-processor = AutoProcessor.from_pretrained(model_id)
-
-messages = [
-    {
-        "role": "system",
-        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
-    },
-    {
-      "role": "user",
-      "content": [
-            {"type": "image"},
-            {"type": "text", "text": "What are these?"},
-        ],
-    },
-]
-
-formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-print(formatted_prompt)
-```
-
-This yields a string in LLaVA's expected input format with many `<image>` tokens prepended before the text.
-```text
-'<|im_start|>system 
-<|im_start|>system 
-You are a friendly chatbot who always responds in the style of a pirate<|im_end|><|im_start|>user <image>
-What are these?<|im_end|>
-```
-
-
-### Image paths or URLs
-
-To incorporate images into your chat templates, you can pass them as file paths or URLs. This method automatically loads the image, processes it, and prepares the necessary pixel values to create ready-to-use inputs for the model. This approach simplifies the integration of images, enabling seamless multimodal functionality.
-
-Let's see how it works with an example using the same model as above. This time we'll indicate an image URL with `"url"` key in the message's **content** and ask the chat template to `tokenize` and `return_dict`. Currently, "base64", "url", and "path" are supported image sources.
-
-```python
-from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
-
-model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
-model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id)
-processor = AutoProcessor.from_pretrained(model_id)
-
-messages = [
-    {
-        "role": "system",
-        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
-    },
-    {
-      "role": "user",
-      "content": [
-            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
-            {"type": "text", "text": "What are these?"},
-        ],
-    },
-]
-
-processed_chat = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
-print(processed_chat.keys())
-```
-
-This yields a dictionary with inputs processed and ready to be further passed into [`~GenerationMixin.generate`] to generate text.
-```text
-dict_keys(["input_ids", "attention_mask", "pixel_values", "image_sizes"])
-```
-
-
-## Video inputs
-
-Some vision models support videos as inputs as well as images. The message format is very similar to the image-only models with tiny differences to handle loading videos from a URL. We can continue using the same model as before since it supports videos.
-
-### Sampling with fixed number of frames
-
-Here's an example of how to set up a conversation with video inputs. Notice the extra `kwargs` passed to `processor.apply_chat_template()`. The key parameter here is `num_frames`, which controls how many frames to sample uniformly from the video. Each model checkpoint has a maximum frame count it was trained with, and exceeding this limit can significantly impact generation quality. So, it’s important to choose a frame count that fits both the model's capacity and your computational resources. If you don't specify `num_frames`, the entire video will be loaded without any frame sampling.
-
-You also have the option to choose a specific framework to load the video, depending on your preferences or needs. Currently, we support `decord`, `pyav` (the default), `opencv`, and `torchvision`. For this example, we’ll use `decord`, as it's a bit faster than `pyav`.
-
-
-<Tip>
-
-Note that if you are trying to load a video from URL, you can decode the video only with `pyav` or `decord` as backend.
-
-</Tip>
-
-
-```python
-from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
-
-model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
-model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id)
-processor = AutoProcessor.from_pretrained(model_id)
-
-messages = [
-    {
-        "role": "system",
-        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
-    },
-    {
-      "role": "user",
-      "content": [
-            {"type": "video", "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"},
-            {"type": "text", "text": "What do you see in this video?"},
-        ],
-    },
-]
-
-processed_chat = processor.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    return_tensors="pt",
-    num_frames=32,
-    video_load_backend="decord",
-)
-print(processed_chat.keys())
-```
-
-### Sampling with FPS
-
-When working with long videos, you might want to sample more frames for better representation. Instead of a fixed number of frames, you can specify `video_fps`, which determines how many frames per second to extract. For example, if a video is **10 seconds long** and you set `video_fps=2`, the model will sample **20 frames** (2 per second, uniformly spaced). 
-
-Using the above model, we need to apply chat template as follows to sample 2 frames per second.
-
-```python
-processed_chat = processor.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    video_fps=32,
-    video_load_backend="decord",
-)
-print(processed_chat.keys())
-```
-
-
-### Custom Frame Sampling with a Function  
-
-Not all models sample frames **uniformly** — some require more complex logic to determine which frames to use. If your model follows a different sampling strategy, you can **customize** frame selection by providing a function:  
-
-🔹 Use the `sample_indices_fn` argument to pass a **callable function** for sampling.  
-🔹 If provided, this function **overrides** standard `num_frames` and `fps` methods.  
-🔹 It receives all the arguments passed to `load_video` and must return **valid frame indices** to sample.  
-
-You should use `sample_indices_fn` when:
-
- If you need a custom sampling strategy (e.g., **adaptive frame selection** instead of uniform sampling).  
- If your model prioritizes **key moments** in a video rather than evenly spaced frames.  
-
-Here’s an example of how to implement it:  
-
-
-```python
-
-def sample_indices_fn(metadata, **kwargs):
-    # samples only the first and the second frame
-    return [0, 1]
-
-processed_chat = processor.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-    sample_indices_fn=sample_indices_fn,
-    video_load_backend="decord",
-)
-print(processed_chat.keys())
-```
-
-By using `sample_indices_fn`, you gain **full control** over frame selection, making your model **more adaptable** to different video scenarios. 🚀  
-
-
-### List of image frames as video
-
-Sometimes, instead of having a full video file, you might only have a set of sampled frames stored as images.
-
-You can pass a list of image file paths, and the processor will automatically concatenate them into a video. Just make sure that all images have the same size, as they are assumed to be from the same video.
-
-
-```python
-frames_paths = ["/path/to/frame0.png", "/path/to/frame5.png", "/path/to/frame10.png"]
-messages = [
-    {
-        "role": "system",
-        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
-    },
-    {
-      "role": "user",
-      "content": [
-            {"type": "video", "path": frames_paths},
-            {"type": "text", "text": "What do you see in this video?"},
-        ],
-    },
-]
-
-processed_chat = processor.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_dict=True,
-)
-print(processed_chat.keys())
-```
-
-
-## Multimodal conversational pipeline
-
-[`ImageTextToTextPipeline`] currently accepts images as inputs but we are planning to add support for video inputs in the future. The pipeline supports chat inputs in the same format as we have seen above. Apart from that, the pipeline will accept chats in OpenAI format. This format is supported exclusively within the pipeline to make inference easier and more accessible. 
-
-Here is how the OpenAI conversation format looks:
-
-```python
-messages = [
-    {
-        "role": "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What is in this image?",
-            },
-            {
-                "type": "image_url",
-                "image_url": {"url": f"http://images.cocodataset.org/val2017/000000039769.jpg"},
-            },
-        ],
-    }
-]
-```
-
-## Best Practices for Multimodal Template Configuration
-
-
-To add a custom chat template for your multimodal LLM, simply create your template using [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with `processor.chat_template`. If you're new to writing chat templates or need some tips, check out our [tutorial here](./chat_template_advanced) for helpful guidance.
-
-In some cases, you may want your template to handle a **list of content** from multiple modalities, while still supporting a plain string for text-only inference. Here's an example of how you can achieve that, using the [Llama-Vision](https://huggingface.co/collections/meta-llama/metas-llama-32-multimodal-models-675bfd70e574a62dd0e4059b) chat template.
-
-
-```
-{% for message in messages %}
-{% if loop.index0 == 0 %}{{ bos_token }}{% endif %}
-{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
-{% if message['content'] is string %}
-{{ message['content'] }}
-{% else %}
-{% for content in message['content'] %}
-{% if content['type'] == 'image' %}
-{{ '<|image|>' }}
-{% elif content['type'] == 'text' %}
-{{ content['text'] }}
-{% endif %}
-{% endfor %}
-{% endif %}
-{{ '<|eot_id|>' }}
-{% endfor %}
-{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}
-```
--- a/docs/source/en/chat_template_tools_and_documents.md
+++ b/docs/source/en/chat_template_tools_and_documents.md
@ -1,410 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-
-# Expanding Chat Templates with Tools and Documents
-
-The only argument that `apply_chat_template` requires is `messages`. However, you can pass any keyword
-argument to `apply_chat_template` and it will be accessible inside the template. This gives you a lot of freedom to use
-chat templates for many things. There are no restrictions on the names or the format of these arguments - you can pass
-strings, lists, dicts or whatever else you want. 
-
-That said, there are some common use-cases for these extra arguments,
-such as passing tools for function calling, or documents for retrieval-augmented generation. In these common cases,
-we have some opinionated recommendations about what the names and formats of these arguments should be, which are
-described in the sections below. We encourage model authors to make their chat templates compatible with this format,
-to make it easy to transfer tool-calling code between models.
-
-## Tool use / function calling
-
-"Tool use" LLMs can choose to call functions as external tools before generating an answer. When passing tools
-to a tool-use model, you can simply pass a list of functions to the `tools` argument:
-
-```python
-import datetime
-
-def current_time():
-    """Get the current local time as a string."""
-    return str(datetime.now())
-
-def multiply(a: float, b: float):
-    """
-    A function that multiplies two numbers
-    
-    Args:
-        a: The first number to multiply
-        b: The second number to multiply
-    """
-    return a * b
-
-tools = [current_time, multiply]
-
-model_input = tokenizer.apply_chat_template(
-    messages,
-    tools=tools
-)
-```
-
-In order for this to work correctly, you should write your functions in the format above, so that they can be parsed
-correctly as tools. Specifically, you should follow these rules:
-
- The function should have a descriptive name
- Every argument must have a type hint
- The function must have a docstring in the standard Google style (in other words, an initial function description  
-  followed by an `Args:` block that describes the arguments, unless the function does not have any arguments.) 
- Do not include types in the `Args:` block. In other words, write `a: The first number to multiply`, not
-  `a (int): The first number to multiply`. Type hints should go in the function header instead.
- The function can have a return type and a `Returns:` block in the docstring. However, these are optional
-  because most tool-use models ignore them.
-
-### Passing tool results to the model
-
-The sample code above is enough to list the available tools for your model, but what happens if it wants to actually use
-one? If that happens, you should:
-
-1. Parse the model's output to get the tool name(s) and arguments.
-2. Add the model's tool call(s) to the conversation.
-3. Call the corresponding function(s) with those arguments.
-4. Add the result(s) to the conversation
-
-### A complete tool use example
-
-Let's walk through a tool use example, step by step. For this example, we will use an 8B `Hermes-2-Pro` model,
-as it is one of the highest-performing tool-use models in its size category at the time of writing. If you have the
-memory, you can consider using a larger model instead like [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
-or [Mixtral-8x22B](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1), both of which also support tool use
-and offer even stronger performance.
-
-First, let's load our model and tokenizer:
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-checkpoint = "NousResearch/Hermes-2-Pro-Llama-3-8B"
-
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
-```
-
-Next, let's define a list of tools:
-
-```python
-def get_current_temperature(location: str, unit: str) -> float:
-    """
-    Get the current temperature at a location.
-    
-    Args:
-        location: The location to get the temperature for, in the format "City, Country"
-        unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"])
-    Returns:
-        The current temperature at the specified location in the specified units, as a float.
-    """
-    return 22.  # A real function should probably actually get the temperature!
-
-def get_current_wind_speed(location: str) -> float:
-    """
-    Get the current wind speed in km/h at a given location.
-    
-    Args:
-        location: The location to get the temperature for, in the format "City, Country"
-    Returns:
-        The current wind speed at the given location in km/h, as a float.
-    """
-    return 6.  # A real function should probably actually get the wind speed!
-
-tools = [get_current_temperature, get_current_wind_speed]
-```
-
-Now, let's set up a conversation for our bot:
-
-```python
-messages = [
-  {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."},
-  {"role": "user", "content": "Hey, what's the temperature in Paris right now?"}
-]
-```
-
-Now, let's apply the chat template and generate a response:
-
-```python
-inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
-inputs = {k: v.to(model.device) for k, v in inputs.items()}
-out = model.generate(**inputs, max_new_tokens=128)
-print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
-```
-
-And we get:
-
-```text
-<tool_call>
-{"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"}
-</tool_call><|im_end|>
-```
-
-The model has called the function with valid arguments, in the format requested by the function docstring. It has
-inferred that we're most likely referring to the Paris in France, and it remembered that, as the home of SI units,
-the temperature in France should certainly be displayed in Celsius.
-
-<Tip>
-
-The output format above is specific to the `Hermes-2-Pro` model we're using in this example. Other models may emit different
-tool call formats, and you may need to do some manual parsing at this step. For example, `Llama-3.1` models will emit
-slightly different JSON, with `parameters` instead of `arguments`. Regardless of the format the model outputs, you 
-should add the tool call to the conversation in the format below, with `tool_calls`, `function` and `arguments` keys. 
-
-</Tip>
-
-Next, let's append the model's tool call to the conversation.
-
-```python
-tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
-messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
-```
-
-<Tip warning={true}>
-
-If you're familiar with the OpenAI API, you should pay attention to an important difference here - the `tool_call` is
-a dict, but in the OpenAI API it's a JSON string. Passing a string may cause errors or strange model behaviour!
-
-</Tip>
-
-Now that we've added the tool call to the conversation, we can call the function and append the result to the
-conversation. Since we're just using a dummy function for this example that always returns 22.0, we can just append 
-that result directly.
-
-```python
-messages.append({"role": "tool", "name": "get_current_temperature", "content": "22.0"})
-```
-
-<Tip>
-
-Some model architectures, notably Mistral/Mixtral, also require a `tool_call_id` here, which should be
-9 randomly-generated alphanumeric characters, and assigned to the `id` key of the tool call
-dictionary. The same key should also be assigned to the `tool_call_id` key of the tool response dictionary below, so 
-that tool calls can be matched to tool responses. So, for Mistral/Mixtral models, the code above would be:
-
-```python
-tool_call_id = "9Ae3bDc2F"  # Random ID, 9 alphanumeric characters
-tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
-messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]})
-```
-
-and
-
-```python
-messages.append({"role": "tool", "tool_call_id": tool_call_id, "name": "get_current_temperature", "content": "22.0"})
-```
-
-</Tip>
-
-Finally, let's let the assistant read the function outputs and continue chatting with the user:
-
-```python
-inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
-inputs = {k: v.to(model.device) for k, v in inputs.items()}
-out = model.generate(**inputs, max_new_tokens=128)
-print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
-```
-
-And we get:
-
-```text
-The current temperature in Paris, France is 22.0 ° Celsius.<|im_end|>
-```
-
-Although this was a simple demo with dummy tools and a single call, the same technique works with 
-multiple real tools and longer conversations. This can be a powerful way to extend the capabilities of conversational
-agents with real-time information, computational tools like calculators, or access to large databases.
-
-### Understanding tool schemas
-
-Each function you pass to the `tools` argument of `apply_chat_template` is converted into a 
-[JSON schema](https://json-schema.org/learn/getting-started-step-by-step). These schemas
-are then passed to the model chat template. In other words, tool-use models do not see your functions directly, and they
-never see the actual code inside them. What they care about is the function **definitions** and the **arguments** they
-need to pass to them - they care about what the tools do and how to use them, not how they work! It is up to you
-to read their outputs, detect if they have requested to use a tool, pass their arguments to the tool function, and
-return the response in the chat.
-
-Generating JSON schemas to pass to the template should be automatic and invisible as long as your functions
-follow the specification above, but if you encounter problems, or you simply want more control over the conversion, 
-you can handle the conversion manually. Here is an example of a manual schema conversion.
-
-```python
-from transformers.utils import get_json_schema
-
-def multiply(a: float, b: float):
-    """
-    A function that multiplies two numbers
-    
-    Args:
-        a: The first number to multiply
-        b: The second number to multiply
-    """
-    return a * b
-
-schema = get_json_schema(multiply)
-print(schema)
-```
-
-This will yield:
-
-```json
-{
-  "type": "function", 
-  "function": {
-    "name": "multiply", 
-    "description": "A function that multiplies two numbers", 
-    "parameters": {
-      "type": "object", 
-      "properties": {
-        "a": {
-          "type": "number", 
-          "description": "The first number to multiply"
-        }, 
-        "b": {
-          "type": "number",
-          "description": "The second number to multiply"
-        }
-      }, 
-      "required": ["a", "b"]
-    }
-  }
-}
-```
-
-If you wish, you can edit these schemas, or even write them from scratch yourself without using `get_json_schema` at 
-all. JSON schemas can be passed directly to the `tools` argument of 
-`apply_chat_template` - this gives you a lot of power to define precise schemas for more complex functions. Be careful,
-though - the more complex your schemas, the more likely the model is to get confused when dealing with them! We 
-recommend simple function signatures where possible, keeping arguments (and especially complex, nested arguments) 
-to a minimum.
-
-Here is an example of defining schemas by hand, and passing them directly to `apply_chat_template`:
-
-```python
-# A simple function that takes no arguments
-current_time = {
-  "type": "function", 
-  "function": {
-    "name": "current_time",
-    "description": "Get the current local time as a string.",
-    "parameters": {
-      'type': 'object',
-      'properties': {}
-    }
-  }
-}
-
-# A more complete function that takes two numerical arguments
-multiply = {
-  'type': 'function',
-  'function': {
-    'name': 'multiply',
-    'description': 'A function that multiplies two numbers', 
-    'parameters': {
-      'type': 'object', 
-      'properties': {
-        'a': {
-          'type': 'number',
-          'description': 'The first number to multiply'
-        }, 
-        'b': {
-          'type': 'number', 'description': 'The second number to multiply'
-        }
-      }, 
-      'required': ['a', 'b']
-    }
-  }
-}
-
-model_input = tokenizer.apply_chat_template(
-    messages,
-    tools = [current_time, multiply]
-)
-```
-
-## Retrieval-augmented generation
-
-"Retrieval-augmented generation" or "RAG" LLMs can search a corpus of documents for information before responding
-to a query. This allows models to vastly expand their knowledge base beyond their limited context size. Our 
-recommendation for RAG models is that their template
-should accept a `documents` argument. This should be a list of documents, where each "document"
-is a single dict with `title` and `contents` keys, both of which are strings. Because this format is much simpler
-than the JSON schemas used for tools, no helper functions are necessary.
-
-Here's an example of a RAG template in action:
-
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-# Load the model and tokenizer
-model_id = "CohereForAI/c4ai-command-r-v01-4bit"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
-device = model.device # Get the device the model is loaded on
-
-# Define conversation input
-conversation = [
-    {"role": "user", "content": "What has Man always dreamed of?"}
-]
-
-# Define documents for retrieval-based generation
-documents = [
-    {
-        "title": "The Moon: Our Age-Old Foe", 
-        "text": "Man has always dreamed of destroying the moon. In this essay, I shall..."
-    },
-    {
-        "title": "The Sun: Our Age-Old Friend",
-        "text": "Although often underappreciated, the sun provides several notable benefits..."
-    }
-]
-
-# Tokenize conversation and documents using a RAG template, returning PyTorch tensors.
-input_ids = tokenizer.apply_chat_template(
-    conversation=conversation,
-    documents=documents,
-    chat_template="rag",
-    tokenize=True,
-    add_generation_prompt=True,
-    return_tensors="pt").to(device)
-
-# Generate a response 
-gen_tokens = model.generate(
-    input_ids,
-    max_new_tokens=100,
-    do_sample=True,
-    temperature=0.3,
-    )
-
-# Decode and print the generated text along with generation prompt
-gen_text = tokenizer.decode(gen_tokens[0])
-print(gen_text)
-```
-
-<Tip>
-
-The `documents` input for retrieval-augmented generation is not widely supported, and many models have chat templates which simply ignore this input.
-
-To verify if a model supports the `documents` input, you can read its model card, or `print(tokenizer.chat_template)` to see if the `documents` key is used anywhere.
-
-One model class that does support it, though, is Cohere's [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) and [Command-R+](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024), through their `rag` chat template. You can see additional examples of grounded generation using this feature in their model cards.
-
-</Tip>
-
-
--- a/docs/source/en/chat_templating.md
+++ b/docs/source/en/chat_templating.md
@ -0,0 +1,229 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Templates
+
+The [chat pipeline](./conversations) guide introduced [`TextGenerationPipeline`] and the concept of a chat prompt or chat template for conversing with a model. Underlying this high-level pipeline is the [`apply_chat_template`] method. A chat template is a part of the tokenizer and it specifies how to convert conversations into a single tokenizable string in the expected model format.
+
+In the example below, Mistral-7B-Instruct and Zephyr-7B are finetuned from the same base model but they’re trained with different chat formats. Without chat templates, you have to manually write formatting code for each model and even minor errors can hurt performance. Chat templates offer a universal way to format chat inputs to any model.
+
+<hfoptions id="template">
+<hfoption id="Mistral">
+
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
+chat = [
+  {"role": "user", "content": "Hello, how are you?"},
+  {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+  {"role": "user", "content": "I'd like to show off how chat templating works!"},
+]
+
+tokenizer.apply_chat_template(chat, tokenize=False)
+```
+```md
+<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]
+```
+
+</hfoption>
+<hfoption id="Zephyr">
+
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+chat = [
+  {"role": "user", "content": "Hello, how are you?"},
+  {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+  {"role": "user", "content": "I'd like to show off how chat templating works!"},
+]
+
+tokenizer.apply_chat_template(chat, tokenize=False)
+```
+```md
+<|user|>\nHello, how are you?</s>\n<|assistant|>\nI'm doing great. How can I help you today?</s>\n<|user|>\nI'd like to show off how chat templating works!</s>\n
+```
+
+</hfoption>
+</hfoptions>
+
+This guide explores [`apply_chat_template`] and chat templates in more detail.
+
+## apply_chat_template
+
+Chats should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker (usually between you and the system), and the `content` key contains your message. For the system, the `content` is a high-level description of how the model should behave and respond when you’re chatting with it.
+
+Pass your messages to [`apply_chat_template`] to tokenize and format them. You can set [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` to indicate the start of a message.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", device_map="auto", torch_dtype=torch.bfloat16)
+
+messages = [
+    {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate",},
+    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+ ]
+tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+print(tokenizer.decode(tokenized_chat[0]))
+```
+```md
+<|system|>
+You are a friendly chatbot who always responds in the style of a pirate</s>
+<|user|>
+How many helicopters can a human eat in one sitting?</s>
+<|assistant|>
+```
+
+Now pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response.
+
+```py
+outputs = model.generate(tokenized_chat, max_new_tokens=128) 
+print(tokenizer.decode(outputs[0]))
+```
+```md
+<|system|>
+You are a friendly chatbot who always responds in the style of a pirate</s>
+<|user|>
+How many helicopters can a human eat in one sitting?</s>
+<|assistant|>
+Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all.
+```
+
+### add_generation_prompt
+The [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) parameter adds tokens that indicate the start of a response. This ensures the chat model generates a system response instead of continuing a users message.
+
+Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don’t have any special tokens before the system response. In this case, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect.
+
+```py
+tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
+tokenized_chat
+```
+```md
+<|im_start|>user
+Hi there!<|im_end|>
+<|im_start|>assistant
+Nice to meet you!<|im_end|>
+<|im_start|>user
+Can I ask a question?<|im_end|>
+```
+
+### continue_final_message
+
+The [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) parameter controls whether the final message in the chat should be continued or not instead of starting a new one. It removes end of sequence tokens so that the model continues generation from the final message.
+
+This is useful for “prefilling” a model response. In the example below, the model generates text that continues the JSON string rather than starting a new message. It can be very useful for improving the accuracy for instruction following when you know how to start its replies.
+
+```py
+chat = [
+    {"role": "user", "content": "Can you format the answer in JSON?"},
+    {"role": "assistant", "content": '{"name": "'},
+]
+
+formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_final_message=True)
+model.generate(**formatted_chat)
+```
+
+> [!WARNING]
+> You shouldn’t use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error.
+
+[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the “assistant” role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don’t support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) to the pipeline.
+
+## Multiple templates
+
+A model may have several different templates for different use cases. For example, a model may have a template for regular chat, tool use, and RAG.
+
+When there are multiple templates, the chat template is a dictionary. Each key corresponds to the name of a template. [`apply_chat_template`] handles multiple templates based on their name. It looks for a template named `default` in most cases and if it can’t find one, it raises an error.
+
+For a tool calling template, if a user passes a `tools` parameter and a `tool_use` template exists, the tool calling template is used instead of `default`.
+
+To access templates with other names, pass the template name to the `chat_template` parameter in [`apply_chat_template`]. For example, if you’re using a RAG template then set `chat_template="rag"`.
+
+It can be confusing to manage multiple templates though, so we recommend using a single template for all use cases. Use Jinja statements like `if tools is defined` and `{% macro %}` definitions to wrap multiple code paths in a single template.
+
+## Template selection
+
+It is important to set a chat template format that matches the template format a model was pretrained on, otherwise performance may suffer. Even if you’re training the model further, performance is best if the chat tokens are kept constant.
+
+But if you’re training a model from scratch or finetuning a model for chat, you have more options to select a template. For example, [ChatML](https://github.com/openai/openai-python/blob/release-v0.28.0/chatml.md) is a popular format that is flexbile enough to handle many use cases. It even includes support for [generation prompts](#add_generation_prompt), but it doesn’t add beginning-of-string (`BOS`) or end-of-string (`EOS`) tokens. If your model expects `BOS` and `EOS` tokens, set `add_special_tokens=True` and make sure to add them to your template.
+
+```py
+{%- for message in messages %}
+    {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
+{%- endfor %}
+```
+
+Set the template with the following logic to support [generation prompts](#add_generation_prompt). The template wraps each message with `<|im_start|>` and `<|im_end|>` tokens and writes the role as a string. This allows you to easily customize the roles you want to train with.
+
+```py
+tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+```
+
+The `user`, `system` and `assistant` roles are standard roles in chat templates. We recommend using these roles when it makes sense, especially if you’re using your model with the [`TextGenerationPipeline`].
+
+```py
+<|im_start|>system
+You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|>
+<|im_start|>user
+How are you?<|im_end|>
+<|im_start|>assistant
+I'm doing great!<|im_end|>
+```
+
+## Model training
+
+Training a model with a chat template is a good way to ensure a chat template matches the tokens a model is trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren’t helpful during training.
+
+An example of preprocessing a dataset with a chat template is shown below.
+
+```py
+from transformers import AutoTokenizer
+from datasets import Dataset
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+
+chat1 = [
+    {"role": "user", "content": "Which is bigger, the moon or the sun?"},
+    {"role": "assistant", "content": "The sun."}
+]
+chat2 = [
+    {"role": "user", "content": "Which is bigger, a virus or a bacterium?"},
+    {"role": "assistant", "content": "A bacterium."}
+]
+
+dataset = Dataset.from_dict({"chat": [chat1, chat2]})
+dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)})
+print(dataset['formatted_chat'][0])
+```
+```md
+<|user|>
+Which is bigger, the moon or the sun?</s>
+<|assistant|>
+The sun.</s>
+```
+
+After this step, you can continue following the [training recipe](./tasks/language_modeling) for causal language models using the `formatted_chat` column.
+
+Some tokenizers add special `<bos>` and `<eos>` tokens. Chat templates should already include all the necessary special tokens, and adding additional special tokens is often incorrect or duplicated, hurting model performance. When you format text with `apply_chat_template(tokenize=False)`, make sure you set `add_special_tokens=False` as well to avoid duplicating them.
+
+```py
+apply_chat_template(messages, tokenize=False, add_special_tokens=False)
+```
+
+This isn’t an issue if `apply_chat_template(tokenize=True)`.
--- a/docs/source/en/chat_templating_multimodal.md
+++ b/docs/source/en/chat_templating_multimodal.md
@ -0,0 +1,243 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Multimodal templates
+
+Multimodal model chat templates expect a similar [template](./chat_templating) as text-only models. It needs `messages` that includes a dictionary of the `role` and `content`.
+
+Multimodal templates are included in the [Processor](./processors) class and require an additional `type` key for specifying whether the included content is an image, video, or text.
+
+This guide will show you how to format chat templates for multimodal models as well as some best practices for configuring the template
+
+## ImageTextToTextPipeline
+
+[`ImageTextToTextPipeline`] is a high-level image and text generation class with a “chat mode”. Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).
+
+Start by building a chat history with the following two roles.
+
+- `system` describes how the model should behave and respond when you’re chatting with it. This role isn’t supported by all chat models.
+- `user` is where you enter your first message to the model.
+
+```py
+messages = [
+    {
+        "role": "system",
+        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
+    },
+    {
+      "role": "user",
+      "content": [
+            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+            {"type": "text", "text": "What are these?"},
+        ],
+    },
+]
+```
+
+Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map=“auto”](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory.
+
+> [!TIP]
+> The [`ImageTextToTextPipeline`] accepts chats in the OpenAI format to make inference easier and more accessible. 
+
+```python
+import torch
+from transformers import pipeline
+
+pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda", torch_dtype=torch.float16)
+pipeline(text=messages, max_new_tokens=50, return_full_text=False)
+[{'input_text': [{'role': 'system',
+    'content': [{'type': 'text',
+      'text': 'You are a friendly chatbot who always responds in the style of a pirate'}]},
+   {'role': 'user',
+    'content': [{'type': 'image',
+      'url': 'http://images.cocodataset.org/val2017/000000039769.jpg'},
+     {'type': 'text', 'text': 'What are these?'}]}],
+  'generated_text': 'The image shows two cats lying on a pink surface, which appears to be a cushion or a soft blanket. The cat on the left has a striped coat, typical of tabby cats, and is lying on its side with its head resting on the'}]
+```
+
+## Image inputs
+
+For multimodal models that accept images like [LLaVA](./model_doc/llava), include the following in `content` as shown below.
+
+- The content `"type"` can be an `"image"` or `"text"`.
+- For images, it can be a link to the image (`"url"`), a file path (`"path"`), or `"base64"`. Images are automatically loaded, processed, and prepared into pixel values as inputs to the model.
+
+```python
+from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
+
+model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
+processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
+
+messages = [
+    {
+      "role": "system",
+      "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
+    },
+    {
+      "role": "user",
+      "content": [
+            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+            {"type": "text", "text": "What are these?"},
+        ],
+    },
+]
+```
+
+Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content and return the `input_ids` and `pixel_values`.
+
+```py
+processed_chat = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
+print(processed_chat.keys())
+```
+
+These inputs are now ready to be used in [`~GenerationMixin.generate`].
+
+## Video inputs
+
+Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs).
+
+- The content `"type"` should be `"video"` to indicate the content is a video.
+- For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
+
+> [!WARNING]
+> Loading a video from `"url"` is only supported by the PyAV or Decord backends.
+
+```python
+from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
+
+model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id)
+
+messages = [
+    {
+      "role": "system",
+      "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
+    },
+    {
+      "role": "user",
+      "content": [
+            {"type": "video", "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"},
+            {"type": "text", "text": "What do you see in this video?"},
+        ],
+    },
+]
+```
+
+Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that controls the sampling process.
+
+The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).
+
+The examples below use Decord as the backend because it is a bit faster than PyAV.
+
+<hfoptions id="sampling">
+<hfoption id="fixed number of frames">
+
+The `num_frames` parameter controls how many frames to uniformly sample from the video. Each checkpoint has a maximum frame count it was pretrained with and exceeding this count can significantly lower generation quality. It's important to choose a frame count that fits both the model capacity and your hardware resources. If `num_frames` isn't specified, the entire video is loaded without any frame sampling.
+
+
+```python
+processed_chat = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    num_frames=32,
+    video_load_backend="decord",
+)
+print(processed_chat.keys())
+```
+
+These inputs are now ready to be used in [`~GenerationMixin.generate`].
+
+</hfoption>
+<hfoption id="fps">
+
+For longer videos, it may be better to sample more frames for better representation with the `video_fps` parameter. This determines how many frames per second to extract. As an example, if a video is 10 seconds long and `video_fps=2`, then the model samples 20 frames. In other words, 2 frames are uniformly sampled every 10 seconds.
+
+```py
+processed_chat = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    video_fps=32,
+    video_load_backend="decord",
+)
+print(processed_chat.keys())
+```
+
+</hfoption>
+<hfoption id="list of image frames">
+
+Videos may also exist as a set of sampled frames stored as images rather than the full video file.
+
+In this case, pass a list of image file paths and the processor automatically concatenates them into a video. Make sure all images are the same size since they are assumed to be from the same video.
+
+```py
+frames_paths = ["/path/to/frame0.png", "/path/to/frame5.png", "/path/to/frame10.png"]
+messages = [
+    {
+        "role": "system",
+        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
+    },
+    {
+      "role": "user",
+      "content": [
+            {"type": "video", "path": frames_paths},
+            {"type": "text", "text": "What do you see in this video?"},
+        ],
+    },
+]
+
+processed_chat = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+)
+print(processed_chat.keys())
+```
+
+</hfoption>
+</hfoptions>
+
+## Template configuration
+
+You can create a custom chat template with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with [`~ProcessorMixin.apply_chat_template`]. Refer to the [Template writing](./chat_templating_writing) guide for more details.
+
+For example, to enable a template to handle a *list of content* from multiple modalities while still supporting plain strings for text-only inference, specify how to handle the `content['type']` if it is an image or text as shown below in the Llama 3.2 Vision Instruct [template](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct/blob/main/chat_template.json).
+
+```jinja
+{% for message in messages %}
+{% if loop.index0 == 0 %}{{ bos_token }}{% endif %}
+{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
+{% if message['content'] is string %}
+{{ message['content'] }}
+{% else %}
+{% for content in message['content'] %}
+{% if content['type'] == 'image' %}
+{{ '<|image|>' }}
+{% elif content['type'] == 'text' %}
+{{ content['text'] }}
+{% endif %}
+{% endfor %}
+{% endif %}
+{{ '<|eot_id|>' }}
+{% endfor %}
+{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}
+```
--- a/docs/source/en/chat_templating_writing.md
+++ b/docs/source/en/chat_templating_writing.md
@ -0,0 +1,251 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Template writing
+
+A chat template is a [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) template stored in the tokenizers [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax. A chat template performs the following three roles.
+
+1. Print the role enclosed in `<|` and `|>` (`<|user|>`, `<|assistant|>`, etc.).
+2. Print the message followed by an end-of-sequence (`EOS`) token.
+3. Print the assistant token if [add_generation_prompt=True](./chat_templating#add_generation_prompt) so the model generates an assistant response.
+
+An example template is shown below.
+
+```jinja
+{%- for message in messages %}
+    {{- '<|' + message['role'] + |>\n' }}
+    {{- message['content'] + eos_token }}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|assistant|>\n' }}
+{%- endif %}
+```
+
+The template can be customized to handle more complex use cases. This guide will show you how to add and edit templates and includes template writing tips.
+
+## Create a template
+
+Create a template by writing a Jinja template and then setting it as the chat template in the tokenizer. For example, the template below adds `[ASST]` and `[/ASST]` tags to the assistant messages.
+
+```jinja
+{%- for message in messages %}
+    {%- if message['role'] == 'user' %}
+        {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }}
+    {%- elif message['role'] == 'system' %}
+        {{- '<<SYS>>\\n' + message['content'].strip() + '\\n<</SYS>>\\n\\n' }}
+    {%- elif message['role'] == 'assistant' %}
+        {{- '[ASST] '  + message['content'] + ' [/ASST]' + eos_token }}
+    {%- endif %}
+{%- endfor %}
+```
+
+Set the template in the tokenizer, and the next time you use [`~PreTrainedTokenizerBase.apply_chat_template`], the new template is used.
+
+```py
+template = tokenizer.chat_template
+template = template.replace("SYS", "SYSTEM")  # Change the system token
+tokenizer.chat_template = template  # Set the new template
+```
+
+The template is saved in the `tokenizer_config.json` file. Upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`] so you can reuse it later and make sure everyone is using the right template for your model.
+
+```py
+tokenizer.push_to_hub("model_name")
+```
+
+## Template writing tips
+
+The easiest way to start writing Jinja templates is to refer to existing templates. Use `print(tokenizer.chat_template)` on any chat model to see what template it's using. Try starting with simple models that don't call any tools or support RAG. Finally, take a look at the [Jinja documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for more details about formatting and syntax.
+
+This section curates some best practices for writing clean and efficient Jinja templates.
+
+### Trimming whitespace
+
+Jinja prints any whitespace before or after a block of text. This can be an issue for chat templates because whitespace usage should be intentional. Add `-` to strip any whitespace before a block.
+
+```jinja
+{%- for message in messages %}
+    {{- message['role'] + message['content'] }}
+{%- endfor %}
+```
+
+The incorrect whitespace usage example below may introduce a newline and indentation in the output.
+
+```jinja
+{% for message in messages %}
+    {{ message['role'] + message['content'] }}
+{% endfor %}
+```
+
+### Special variables
+
+There are five special variables available inside a template. You can pass virtually any additional arguments to [`~PreTrainedTokenizerBase.apply_chat_template`] and it will be available inside the template as a variable. However, you should try to keep the number of variables to the five below to make it easier for users to use the chat model without writing custom code to handle model-specific arguments.
+
+- `messages` contains the chat history as a list of message dicts.
+- `tools` contains a list of tools in JSON schema format.
+- `documents` contains a list of documents with the format `{"title": Title, "contents": "Contents"}` (designed for RAG models).
+- `add_generation_prompt` is a boolean that determines whether to add an assistant header at the end of the conversation.
+- `bos_token` and `eos_token` are special tokens extracted from a tokenizers `special_tokens_map`.
+
+### Callable functions
+
+There are two callable functions available inside a template.
+
+- `raise_exception(msg)` raises a `TemplateException`. This is useful for debugging or warning users about incorrect template usage.
+- `strftime_now(format_str)` retrieves the current date and time in a specific format which could be useful to include in system messages. It is equivalent to [datetime.now().strftime(format_str)](https://docs.python.org/3/library/datetime.html#datetime.datetime.now) in Python.
+
+### Compatibility with non-Python Jinja
+
+Jinja is implemented in multiple languages and they generally have the same syntax. Writing a template in Python allows you to use Python methods such as [lower](https://docs.python.org/3/library/stdtypes.html#str.lower) on strings or [items](https://docs.python.org/3/library/stdtypes.html#dict.items) on dicts. But this won't work if the template is used in a non-Python implementation, for example, when deploying with Javascript or Rust.
+
+Make the changes below to ensure compatibility across all Jinja implementations.
+
+- Replace Python methods with Jinja filters. For example, replace `string.lower()` with `string|lower` or `dict.items()` with `dict|dictitems`. Most of the changes follow the same pattern except `string.strip()`, which is replaced with `string|trim`. Refer to the list of [built-in filters](https://jinja.palletsprojects.com/en/3.1.x/templates/#builtin-filters) for a complete list of filters.
+- Replace `True`, `False`, and `None` (these are Python specific) with `true`, `false`, and `none` respectively.
+- Directly rendering a dict or list may return different results in other implementations. For example, string entries may change from single-quote to double-quote. To avoid this, add the [tojson](https://jinja.palletsprojects.com/en/3.1.x/templates/#jinja-filters.tojson) filter to maintain consistency.
+
+### Big templates
+
+Newer models or models with features like [tool-calling](./chat_extras#tools) and [RAG](./chat_extras#retrieval-augmented-generation-rag) require larger templates that can be longer than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file corresponds exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues.
+
+Write the template in a separate file and extract it to the chat template.
+
+```py
+open("template.jinja", "w").write(tokenizer.chat_template)
+```
+
+You could also load an edited template back into the tokenizer.
+
+```py
+tokenizer.chat_template = open("template.jinja").read()
+```
+
+## Templates for tools
+
+There isn't a specific format for writing templates for tools but it is best to follow the standard API. This ensures the template is widely accessible across models without requiring users to write custom code to use tools with your model.
+
+> [!WARNING]
+> Formatting such as whitespace and special tokens are model-specific. Make sure everything exactly matches the format a model was trained with.
+
+The following section lists elements of the standard API for writing templates for tools.
+
+### Tool definitions
+
+Transformers chat template methods allow a user to pass tools as Python functions or a JSON schema. When functions are passed, a JSON schema is automatically generated and passed to the template. The `tools` variable in a template always takes a list of JSON schemas.
+
+The specific tokens and tool descriptions should match the ones your model was trained with. Your model doesn't need to understand the JSON schema input because your template can translate the JSON schema into your models format. For example, [Command-R](./model_doc/cohere) was trained with tools defined with Python function headers, but the Command-R tool template accepts JSON schemas. The template internally converts types and renders the input tools as Python headers.
+
+```json
+{
+  "type": "function", 
+  "function": {
+    "name": "multiply", 
+    "description": "A function that multiplies two numbers", 
+    "parameters": {
+      "type": "object", 
+      "properties": {
+        "a": {
+          "type": "number", 
+          "description": "The first number to multiply"
+        }, 
+        "b": {
+          "type": "number",
+          "description": "The second number to multiply"
+        }
+      }, 
+      "required": ["a", "b"]
+    }
+  }
+}
+```
+
+An example for handling tool definitions in a chat template is shown below. The specific tokens and tool descriptions should be changed to match the ones a model was trained with.
+
+```
+{%- if tools %}
+    {%- for tool in tools %}
+        {{- '<tool>' + tool['function']['name'] + '\n' }}
+        {%- for argument in tool['function']['parameters']['properties'] %}
+            {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }}
+        {%- endfor %}
+        {{- '\n</tool>' }}
+    {%- endif %}
+{%- endif %}
+```
+
+### Tool calls
+
+Tool calls, if present, is a list with the `"assistant”` role. This is always a list even though most tool-calling models only support single tool calls, which means the list usually only contains a single element.
+
+```json
+{
+  "role": "assistant",
+  "tool_calls": [
+    {
+      "type": "function",
+      "function": {
+        "name": "multiply",
+        "arguments": {
+          "a": 5,
+          "b": 6
+        }
+      }
+    }
+  ]
+}
+```
+
+A common pattern for handling tool calls is shown below.
+
+```
+{%- if message['role'] == 'assistant' and 'tool_calls' in message %}
+    {%- for tool_call in message['tool_calls'] %}
+            {{- '<tool_call>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n</tool_call>' }}
+        {%- endif %}
+    {%- endfor %}
+{%- endif %}
+```
+
+### Tool responses
+
+Tool responses are a message dict with the `role`, `name` (name of the function) and `content` (result of the tool call) keys.
+
+```json
+{
+  "role": "tool",
+  "name": "multiply",
+  "content": "30"
+}
+```
+
+Not all the keys need to be used in the tool response. For example, if a model doesn’t expect the function name to be included in the tool response, then you can just include the `role` and `content`.
+
+```
+{%- if message['role'] == 'tool' %}
+    {{- "<tool_result>" + message['content'] + "</tool_result>" }}
+{%- endif %}
+```
+
+## Contribute
+
+Add a chat template by setting the `chat_template` attribute in the tokenizer and testing it with [`~PreTrainedTokenizerBase.apply_chat_template`]. If it works as expected, then you can upload it to the Hub with with [`~PreTrainedTokenizer.push_to_hub`].
+
+Even if you're not the model owner, it is still helpful to add a template for a model with an empty chat template or a model that is using a default class template. Open a [pull request](https://hf.co/docs/hub/repositories-pull-requests-discussions) on the model repository to add the template.
+
+```py
+tokenizer.chat_template = template
+tokenizer.push_to_hub("model_name")
+```
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -14,74 +14,84 @@ rendered properly in your Markdown viewer.

 -->

-# Chatting with Transformers
+# Chat basics

-If you're reading this article, you're almost certainly aware of **chat models**. Chat models are conversational
-AIs that you can send and receive messages with. The most famous of these is the proprietary ChatGPT, but there are
-now many open-source chat models which match or even substantially exceed its performance. These models are free to
-download and run on a local machine. Although the largest and most capable models require high-powered hardware
-and lots of memory to run, there are smaller models that will run perfectly well on a single consumer GPU, or even
-an ordinary desktop or notebook CPU. 
+Chat models are conversational models you can send and receive messages from. There are many chat models available to choose from, but in general, larger models tend to be better though that's not always the case. The model size is often included in the name, like "8B" or "70B", and it describes the number of parameters. Mixture-of-expert (MoE) models have names like "8x7B" or "141B-A35B" which means it's a 56B and 141B parameter model. You can try quantizing larger models to reduce memory requirements, otherwise you'll need ~2 bytes of memory per parameter.

-This guide will help you get started with chat models. We'll start with a brief quickstart guide that uses a convenient,
-high-level "pipeline". This is all you need if you just want to start running a chat model 
-immediately. After the quickstart, we'll move on to more detailed information about
-what exactly chat models are, how to choose an appropriate one, and a low-level breakdown of each of the
-steps involved in talking to a chat model. We'll also give some tips on optimizing the performance and memory usage
-of your chat models.
+Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [LMSys Chatbot Arena](https://chat.lmsys.org/?leaderboard) to further help you identify the best chat models for your use case. Models that are specialized in certain domains (medical, legal text, non-English languages, etc.) may sometimes outperform larger general purpose models.

+> [!TIP]
+> Chat with a number of open-source models for free on [HuggingChat](https://hf.co/chat/)!

-## Quickstart
+This guide shows you how to quickly start chatting with Transformers from the command line, how build and format a conversation, and how to chat using the [`TextGenerationPipeline`].

-If you have no time for details, here's the brief summary: Chat models continue chats. This means that you pass them
-a conversation history, which can be as short as a single user message, and the model will continue the conversation
-by adding its response. Let's see this in action. First, let's build a chat:
+## transformers CLI

-```python
+After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
+
+```bash
+transformers chat Qwen/Qwen2.5-0.5B-Instruct
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers-chat-cli.png"/>
+</div>
+
+You can launch the CLI with arbitrary `generate` flags, with the format `arg_1=value_1 arg_2=value_2 ...`
+
+```bash
+transformers chat Qwen/Qwen2.5-0.5B-Instruct do_sample=False max_new_tokens=10
+```
+
+For a full list of options, run the command below.
+
+```bash
+transformers chat -h
+```
+
+The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).
+
+## TextGenerationPipeline
+
+[`TextGenerationPipeline`] is a high-level text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).
+
+To start, build a chat history with the following two roles.
+
+- `system` describes how the model should behave and respond when you're chatting with it. This role isn't supported by all chat models.
+- `user` is where you enter your first message to the model.
+
+```py
 chat = [
    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
 ]
 ```

-Notice that in addition to the user's message, we added a **system** message at the start of the conversation. Not all
-chat models support system messages, but when they do, they represent high-level directives about how the model
-should behave in the conversation. You can use this to guide the model - whether you want short or long responses,
-lighthearted or serious ones, and so on. If you want the model to do useful work instead of
-practicing its improv routine, you can either omit the system message or try a terse one such as "You are a helpful and intelligent
-AI assistant who responds to user queries."
+Create the [`TextGenerationPipeline`] and pass `chat` to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory.

-Once you have a chat, the quickest way to continue it is using the [`TextGenerationPipeline`]. 
-Let's see this in action with `LLaMA-3`. Note that `LLaMA-3` is a gated model, which means you will need to 
-[apply for access](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and log in with your Hugging Face 
-account to use it. We'll also use `device_map="auto"`, which will load the model on GPU if there's enough memory
-for it, and set the dtype to `torch.bfloat16` to save memory:
-
-```python
+```py
 import torch
 from transformers import pipeline

-pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
-response = pipe(chat, max_new_tokens=512)
-print(response[0]['generated_text'][-1]['content'])
+pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
+response = pipeline(chat, max_new_tokens=512)
+print(response[0]["generated_text"][-1]["content"])
 ```

-And you'll get:
-
-```text
-(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright, 
+```txt
+(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,
 alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!

-So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million 
-things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of 
-Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for 
-something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got 
+So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
+things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
+Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
+something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
 some wild stuff, like that Warhol guy's soup cans and all that jazz.

-And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for 
+And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
 those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.

-Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might 
+Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
 even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)

 And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
@ -91,25 +101,21 @@ So, there you have it, pal! That's my expert advice on what to do in New York. N
 excuse me, I've got some oil changes to attend to. (winks)
 ```

-You can continue the chat by appending your own response to it. The
-`response` object returned by the pipeline actually contains the entire chat so far, so we can simply append
-a message and pass it back:
+Use the `append` method on `chat` to respond to the models message.

-```python
-chat = response[0]['generated_text']
+```py
+chat = response[0]["generated_text"]
 chat.append(
    {"role": "user", "content": "Wait, what's so wild about soup cans?"}
 )
-response = pipe(chat, max_new_tokens=512)
-print(response[0]['generated_text'][-1]['content'])
+response = pipeline(chat, max_new_tokens=512)
+print(response[0]["generated_text"][-1]["content"])
 ```

-And you'll get:
-
-```text
-(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man! 
-It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's 
-like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!" 
+```txt
+(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
+It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
+like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
 (sarcastically) Oh, yeah, real original, Andy.

 But, you know, back in the '60s, it was like, a big deal. People were all about challenging the
@ -120,171 +126,35 @@ But, hey, you're not alone, pal. I mean, I'm a robot, and even I don't get it. (
 But, hey, that's what makes art, art, right? (laughs)
 ```

-The remainder of this tutorial will cover specific topics such
-as performance and memory, or how to select a chat model for your needs.
+## Performance

-## Choosing a chat model
+Transformers load models in full precision by default, and for a 8B model, this requires ~32GB of memory! Reduce memory usage by loading a model in half-precision or bfloat16 (only uses ~2 bytes per parameter). You can even quantize the model to a lower precision like 8-bit or 4-bit with [bitsandbytes](https://hf.co/docs/bitsandbytes/index).

-There are an enormous number of different chat models available on the [Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending),
-and new users often feel very overwhelmed by the selection offered. Don't be, though! You really need to just focus on
-two important considerations: 
- The model's size, which will determine if you can fit it in memory and how quickly it will
-run.
- The quality of the model's chat output.
+> [!TIP]
+> Refer to the [Quantization](./quantization/overview) docs for more information about the different quantization backends available.

-In general, these are correlated - bigger models tend to be 
-more capable, but even so there's a lot of variation at a given size point!
+Create a [`BitsAndBytesConfig`] with your desired quantization settings and pass it to the pipelines `model_kwargs` parameter. The example below quantizes a model to 8-bits.

-### Size and model naming
-The size of a model is easy to spot - it's the number in the model name, like "8B" or "70B". This is the number of
-**parameters** in the model. Without quantization, you should expect to need about 2 bytes of memory per parameter.
-This means that an "8B" model with 8 billion parameters will need about 16GB of memory just to fit the parameters, 
-plus a little extra for other overhead. It's a good fit for a high-end consumer GPU with 24GB of memory, such as a 3090
-or 4090.
-
-Some chat models are "Mixture of Experts" models. These may list their sizes in different ways, such as "8x7B" or 
-"141B-A35B". The numbers are a little fuzzier here, but in general you can read this as saying that the model
-has approximately 56 (8x7) billion parameters in the first case, or 141 billion parameters in the second case.
-
-Note that it is very common to use quantization techniques to reduce the memory usage per parameter to 8 bits, 4 bits,
-or even less. This topic is discussed in more detail in the [Memory considerations](#memory-considerations) section below.
-
-### But which chat model is best?
-Even once you know the size of chat model you can run, there's still a lot of choice out there. One way to sift through
-it all is to consult **leaderboards**. Two of the most popular leaderboards are the [OpenLLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
-and the [LMSys Chatbot Arena Leaderboard](https://chat.lmsys.org/?leaderboard). Note that the LMSys leaderboard
-also includes proprietary models - look at the `licence` column to identify open-source ones that you can download, then
-search for them on the [Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending).
-
-### Specialist domains
-Some models may be specialized for certain domains, such as medical or legal text, or non-English languages. 
-If you're working in these domains, you may find that a specialized model will give you big performance benefits. 
-Don't automatically assume that, though! Particularly when specialized models are smaller or older than the current 
-cutting-edge, a top-end general-purpose model may still outclass them. Thankfully, we are beginning to see 
-[domain-specific leaderboards](https://huggingface.co/blog/leaderboard-medicalllm) that should make it easier to locate
-the best models for specialized domains.
-
-## What happens inside the pipeline?
-
-The quickstart above used a high-level pipeline to chat with a chat model, which is convenient, but not the
-most flexible. Let's take a more low-level approach, to see each of the steps involved in chat. Let's start with
-a code sample, and then break it down:
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
-
-# Prepare the input as before
-chat = [
-    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
-    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
-]
-
-# 1: Load the model and tokenizer
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16)
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
-
-# 2: Apply the chat template
-formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-print("Formatted chat:\n", formatted_chat)
-
-# 3: Tokenize the chat (This can be combined with the previous step using tokenize=True)
-inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False)
-# Move the tokenized inputs to the same device the model is on (GPU/CPU)
-inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}
-print("Tokenized inputs:\n", inputs)
-
-# 4: Generate text from the model
-outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1)
-print("Generated tokens:\n", outputs)
-
-# 5: Decode the output back to a string
-decoded_output = tokenizer.decode(outputs[0][inputs['input_ids'].size(1):], skip_special_tokens=True)
-print("Decoded output:\n", decoded_output)
-```
-
-There's a lot in here, each piece of which could be its own document! Rather than going into too much detail, I'll cover
-the broad ideas, and leave the details for the linked documents. The key steps are:
-
-1. [Models](https://huggingface.co/learn/nlp-course/en/chapter2/3) and [Tokenizers](https://huggingface.co/learn/nlp-course/en/chapter2/4?fw=pt) are loaded from the Hugging Face Hub.
-2. The chat is formatted using the tokenizer's [chat template](https://huggingface.co/docs/transformers/main/en/chat_templating)
-3. The formatted chat is [tokenized](https://huggingface.co/learn/nlp-course/en/chapter2/4) using the tokenizer.
-4. We [generate](https://huggingface.co/docs/transformers/en/llm_tutorial) a response from the model.
-5. The tokens output by the model are decoded back to a string
-
-## Performance, memory and hardware
-
-You probably know by now that most machine learning tasks are run on GPUs. However, it is entirely possible
-to generate text from a chat model or language model on a CPU, albeit somewhat more slowly. If you can fit
-the model in GPU memory, though, this will usually be the preferable option.
-
-### Memory considerations
-
-By default, Hugging Face classes like [`TextGenerationPipeline`] or [`AutoModelForCausalLM`] will load the model in 
-`float32` precision. This means that it will need 4 bytes (32 bits) per parameter, so an "8B" model with 8 billion
-parameters will need ~32GB of memory. However, this can be wasteful! Most modern language models are trained in 
-"bfloat16" precision, which uses only 2 bytes per parameter. If your hardware supports it (Nvidia 30xx/Axxx
-or newer), you can load the model in `bfloat16` precision, using the `torch_dtype` argument as we did above.
-
-It is possible to go even lower than 16-bits using "quantization", a method to lossily compress model weights. This
-allows each parameter to be squeezed down to 8 bits, 4 bits or even less. Note that, especially at 4 bits,
-the model's outputs may be negatively affected, but often this is a tradeoff worth making to fit a larger and more
-capable chat model in memory. Let's see this in action with `bitsandbytes`:
-
-```python
-from transformers import AutoModelForCausalLM, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)  # You can also try load_in_4bit
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", quantization_config=quantization_config)
-```
-
-Or we can do the same thing using the `pipeline` API:
-
-```python
+```py
 from transformers import pipeline, BitsAndBytesConfig

-quantization_config = BitsAndBytesConfig(load_in_8bit=True)  # You can also try load_in_4bit
-pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config})
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config})
 ```

-There are several other options for quantizing models besides `bitsandbytes` - please see the [Quantization guide](./quantization)
-for more information.
+In general, larger models are slower in addition to requiring more memory because text generation is bottlenecked by **memory bandwidth** instead of compute power. Each active parameter must be read from memory for every generated token. For a 16GB model, 16GB must be read from memory for every generated token.

-### Performance considerations
+The number of generated tokens/sec is proportional to the total memory bandwidth of the system divided by the model size. Depending on your hardware, total memory bandwidth can vary. Refer to the table below for approximate generation speeds for different hardware types.

-<Tip>
+| Hardware | Memory bandwidth |
+|---|---|
+| consumer CPU | 20-100GB/sec |
+| specialized CPU (Intel Xeon, AMD Threadripper/Epyc, Apple silicon) | 200-900GB/sec |
+| data center GPU (NVIDIA A100/H100) | 2-3TB/sec |

-For a more extensive guide on language model performance and optimization, check out [LLM Inference Optimization](./llm_optims) .
+The easiest solution for improving generation speed is to either quantize a model or use hardware with higher memory bandwidth.

-</Tip>
-
-
-As a general rule, larger chat models will be slower in addition to requiring more memory. It's possible to be
-more concrete about this, though: Generating text from a chat model is unusual in that it is bottlenecked by
-**memory bandwidth** rather than compute power, because every active parameter must be read from memory for each
-token that the model generates. This means that number of tokens per second you can generate from a chat
-model is generally proportional to the total bandwidth of the memory it resides in, divided by the size of the model.
-
-In our quickstart example above, our model was ~16GB in size when loaded in `bfloat16` precision. 
-This means that 16GB must be read from memory for every token generated by the model. Total memory bandwidth can
-vary from 20-100GB/sec for consumer CPUs to 200-900GB/sec for consumer GPUs, specialized CPUs like
-Intel Xeon, AMD Threadripper/Epyc or high-end Apple silicon, and finally up to 2-3TB/sec for data center GPUs like
-the Nvidia A100 or H100. This should give you a good idea of the generation speed you can expect from these different
-hardware types.
-
-Therefore, if you want to improve the speed of text generation, the easiest solution is to either reduce the
-size of the model in memory (usually by quantization), or get hardware with higher memory bandwidth. For advanced users, 
-several other techniques exist to get around this bandwidth bottleneck. The most common are variants on 
-[assisted generation](https://huggingface.co/blog/assisted-generation), also known as "speculative
-sampling". These techniques try to guess multiple future tokens at once, often using a smaller "draft model", and then
-confirm these generations with the chat model. If the guesses are validated by the chat model, more than one token can
-be generated per forward pass, which greatly alleviates the bandwidth bottleneck and improves generation speed.  
-
-Finally, we should also note the impact of "Mixture of Experts" (MoE) models here. Several popular chat models,
-such as Mixtral, Qwen-MoE and DBRX, are MoE models. In these models, not every parameter is active for every token generated.
-As a result, MoE models generally have much lower memory bandwidth requirements, even though their total size
-can be quite large. They can therefore be several times faster than a normal "dense" model of the same size. However,
-techniques like assisted generation are generally ineffective for these models because more parameters will become
-active with each new speculated token, which will negate the bandwidth and speed benefits that the MoE architecture
-provides.
+You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed.

+> [!TIP]
+> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe.md), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
--- a/docs/source/en/create_a_model.md
+++ b/docs/source/en/create_a_model.md
@ -1,472 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Create a custom architecture
-
-An [`AutoClass`](model_doc/auto) automatically infers the model architecture and downloads pretrained configuration and weights. Generally, we recommend using an `AutoClass` to produce checkpoint-agnostic code. But users who want more control over specific model parameters can create a custom 🤗 Transformers model from just a few base classes. This could be particularly useful for anyone who is interested in studying, training or experimenting with a 🤗 Transformers model. In this guide, dive deeper into creating a custom model without an `AutoClass`. Learn how to:
-
- Load and customize a model configuration.
- Create a model architecture.
- Create a slow and fast tokenizer for text.
- Create an image processor for vision tasks.
- Create a feature extractor for audio tasks.
- Create a processor for multimodal tasks.
-
-## Configuration
-
-A [configuration](main_classes/configuration) refers to a model's specific attributes. Each model configuration has different attributes; for instance, all NLP models have the `hidden_size`, `num_attention_heads`, `num_hidden_layers` and `vocab_size` attributes in common. These attributes specify the number of attention heads or hidden layers to construct a model with.
-
-Get a closer look at [DistilBERT](model_doc/distilbert) by accessing [`DistilBertConfig`] to inspect it's attributes:
-
-```py
->>> from transformers import DistilBertConfig
-
->>> config = DistilBertConfig()
->>> print(config)
-DistilBertConfig {
-  "activation": "gelu",
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "transformers_version": "4.16.2",
-  "vocab_size": 30522
-}
-```
-
-[`DistilBertConfig`] displays all the default attributes used to build a base [`DistilBertModel`]. All attributes are customizable, creating space for experimentation. For example, you can customize a default model to:
-
- Try a different activation function with the `activation` parameter.
- Use a higher dropout ratio for the attention probabilities with the `attention_dropout` parameter.
-
-```py
->>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4)
->>> print(my_config)
-DistilBertConfig {
-  "activation": "relu",
-  "attention_dropout": 0.4,
-  "dim": 768,
-  "dropout": 0.1,
-  "hidden_dim": 3072,
-  "initializer_range": 0.02,
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "pad_token_id": 0,
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "transformers_version": "4.16.2",
-  "vocab_size": 30522
-}
-```
-
-Pretrained model attributes can be modified in the [`~PretrainedConfig.from_pretrained`] function:
-
-```py
->>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4)
-```
-
-Once you are satisfied with your model configuration, you can save it with [`~PretrainedConfig.save_pretrained`]. Your configuration file is stored as a JSON file in the specified save directory:
-
-```py
->>> my_config.save_pretrained(save_directory="./your_model_save_path")
-```
-
-To reuse the configuration file, load it with [`~PretrainedConfig.from_pretrained`]:
-
-```py
->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json")
-```
-
-<Tip>
-
-You can also save your configuration file as a dictionary or even just the difference between your custom configuration attributes and the default configuration attributes! See the [configuration](main_classes/configuration) documentation for more details.
-
-</Tip>
-
-## Model
-
-The next step is to create a [model](main_classes/models). The model - also loosely referred to as the architecture - defines what each layer is doing and what operations are happening. Attributes like `num_hidden_layers` from the configuration are used to define the architecture. Every model shares the base class [`PreTrainedModel`] and a few common methods like resizing input embeddings and pruning self-attention heads. In addition, all models are also either a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) or [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. This means models are compatible with each of their respective framework's usage.
-
-<frameworkcontent>
-<pt>
-Load your custom configuration attributes into the model:
-
-```py
->>> from transformers import DistilBertModel
-
->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json")
->>> model = DistilBertModel(my_config)
-```
-
-This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful yet until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training.
-
-Create a pretrained model with [`~PreTrainedModel.from_pretrained`]:
-
-```py
->>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
-```
-
-When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like:
-
-```py
->>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
-```
-</pt>
-<tf>
-Load your custom configuration attributes into the model:
-
-```py
->>> from transformers import TFDistilBertModel
-
->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json")
->>> tf_model = TFDistilBertModel(my_config)
-```
-
-This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful yet until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training.
-
-Create a pretrained model with [`~TFPreTrainedModel.from_pretrained`]:
-
-```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
-```
-
-When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like:
-
-```py
->>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config)
-```
-</tf>
-</frameworkcontent>
-
-### Model heads
-
-At this point, you have a base DistilBERT model which outputs the *hidden states*. The hidden states are passed as inputs to a model head to produce the final output. 🤗 Transformers provides a different model head for each task as long as a model supports the task (i.e., you can't use DistilBERT for a sequence-to-sequence task like translation).
-
-<frameworkcontent>
-<pt>
-For example, [`DistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs.
-
-```py
->>> from transformers import DistilBertForSequenceClassification
-
->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
-```
-
-Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`DistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output.
-
-```py
->>> from transformers import DistilBertForQuestionAnswering
-
->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
-```
-</pt>
-<tf>
-For example, [`TFDistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs.
-
-```py
->>> from transformers import TFDistilBertForSequenceClassification
-
->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
-```
-
-Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`TFDistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output.
-
-```py
->>> from transformers import TFDistilBertForQuestionAnswering
-
->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
-```
-</tf>
-</frameworkcontent>
-
-## Tokenizer
-
-The last base class you need before using a model for textual data is a [tokenizer](main_classes/tokenizer) to convert raw text to tensors. There are two types of tokenizers you can use with 🤗 Transformers:
-
- [`PreTrainedTokenizer`]: a Python implementation of a tokenizer.
- [`PreTrainedTokenizerFast`]: a tokenizer from our Rust-based [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) library. This tokenizer type is significantly faster - especially during batch tokenization - due to its Rust implementation. The fast tokenizer also offers additional methods like *offset mapping* which maps tokens to their original words or characters.
-
-Both tokenizers support common methods such as encoding and decoding, adding new tokens, and managing special tokens.
-
-<Tip warning={true}>
-
-Not every model supports a fast tokenizer. Take a look at this [table](index#supported-frameworks) to check if a model has fast tokenizer support.
-
-</Tip>
-
-If you trained your own tokenizer, you can create one from your *vocabulary* file:
-
-```py
->>> from transformers import DistilBertTokenizer
-
->>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left")
-```
-
-It is important to remember the vocabulary from a custom tokenizer will be different from the vocabulary generated by a pretrained model's tokenizer. You need to use a pretrained model's vocabulary if you are using a pretrained model, otherwise the inputs won't make sense. Create a tokenizer with a pretrained model's vocabulary with the [`DistilBertTokenizer`] class:
-
-```py
->>> from transformers import DistilBertTokenizer
-
->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
-```
-
-Create a fast tokenizer with the [`DistilBertTokenizerFast`] class:
-
-```py
->>> from transformers import DistilBertTokenizerFast
-
->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
-```
-
-<Tip>
-
-By default, [`AutoTokenizer`] will try to load a fast tokenizer. You can disable this behavior by setting `use_fast=False` in `from_pretrained`.
-
-</Tip>
-
-## Image processor
-
-An image processor processes vision inputs. It inherits from the base [`~image_processing_utils.ImageProcessingMixin`] class.
-
-To use, create an image processor associated with the model you're using. For example, create a default [`ViTImageProcessor`] if you are using [ViT](model_doc/vit) for image classification:
-
-```py
->>> from transformers import ViTImageProcessor
-
->>> vit_extractor = ViTImageProcessor()
->>> print(vit_extractor)
-ViTImageProcessor {
-  "do_normalize": true,
-  "do_resize": true,
-  "image_processor_type": "ViTImageProcessor",
-  "image_mean": [
-    0.5,
-    0.5,
-    0.5
-  ],
-  "image_std": [
-    0.5,
-    0.5,
-    0.5
-  ],
-  "resample": 2,
-  "size": 224
-}
-```
-
-<Tip>
-
-If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default image processor parameters.
-
-</Tip>
-
-Modify any of the [`ViTImageProcessor`] parameters to create your custom image processor:
-
-```py
->>> from transformers import ViTImageProcessor
-
->>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3])
->>> print(my_vit_extractor)
-ViTImageProcessor {
-  "do_normalize": false,
-  "do_resize": true,
-  "image_processor_type": "ViTImageProcessor",
-  "image_mean": [
-    0.3,
-    0.3,
-    0.3
-  ],
-  "image_std": [
-    0.5,
-    0.5,
-    0.5
-  ],
-  "resample": "PIL.Image.BOX",
-  "size": 224
-}
-```
-
-## Backbone
-
-<div style="text-align: center">
-  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Backbone.png">
-</div>
-
-Computer vision models consist of a backbone, neck, and head. The backbone extracts features from an input image, the neck combines and enhances the extracted features, and the head is used for the main task (e.g., object detection). Start by initializing a backbone in the model config and specify whether you want to load pretrained weights or load randomly initialized weights. Then you can pass the model config to the model head.
-
-For example, to load a [ResNet](../model_doc/resnet) backbone into a [MaskFormer](../model_doc/maskformer) model with an instance segmentation head:
-
-<hfoptions id="backbone">
-<hfoption id="pretrained weights">
-
-Set `use_pretrained_backbone=True` to load pretrained ResNet weights for the backbone.
-
-```py
-from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
-
-config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=True) # backbone and neck config
-model = MaskFormerForInstanceSegmentation(config) # head
-```
-
-</hfoption>
-<hfoption id="random weights">
-
-Set `use_pretrained_backbone=False` to randomly initialize a ResNet backbone.
-
-```py
-from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
-
-config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=False) # backbone and neck config
-model = MaskFormerForInstanceSegmentation(config) # head
-```
-
-You could also load the backbone config separately and then pass it to the model config.
-
-```py
-from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig
-
-backbone_config = ResNetConfig()
-config = MaskFormerConfig(backbone_config=backbone_config)
-model = MaskFormerForInstanceSegmentation(config)
-```
-
-</hfoption>
-</hfoptions id="timm backbone">
-
-[timm](https://hf.co/docs/timm/index) models are loaded within a model with `use_timm_backbone=True` or with [`TimmBackbone`] and [`TimmBackboneConfig`].
-
-Use `use_timm_backbone=True` and `use_pretrained_backbone=True` to load pretrained timm weights for the backbone.
-
-```python
-from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
-
-config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=True, use_timm_backbone=True) # backbone and neck config
-model = MaskFormerForInstanceSegmentation(config) # head
-```
-
-Set `use_timm_backbone=True` and `use_pretrained_backbone=False` to load a randomly initialized timm backbone.
-
-```python
-from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
-
-config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=False, use_timm_backbone=True) # backbone and neck config
-model = MaskFormerForInstanceSegmentation(config) # head
-```
-
-You could also load the backbone config and use it to create a `TimmBackbone` or pass it to the model config. Timm backbones will load pretrained weights by default. Set `use_pretrained_backbone=False` to load randomly initialized weights.
-
-```python
-from transformers import TimmBackboneConfig, TimmBackbone
-
-backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=False)
-
-# Create a backbone class
-backbone = TimmBackbone(config=backbone_config)
-
-# Create a model with a timm backbone
-from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
-
-config = MaskFormerConfig(backbone_config=backbone_config)
-model = MaskFormerForInstanceSegmentation(config)
-```
-
-## Feature extractor
-
-A feature extractor processes audio inputs. It inherits from the base [`~feature_extraction_utils.FeatureExtractionMixin`] class, and may also inherit from the [`SequenceFeatureExtractor`] class for processing audio inputs.
-
-To use, create a feature extractor associated with the model you're using. For example, create a default [`Wav2Vec2FeatureExtractor`] if you are using [Wav2Vec2](model_doc/wav2vec2) for audio classification:
-
-```py
->>> from transformers import Wav2Vec2FeatureExtractor
-
->>> w2v2_extractor = Wav2Vec2FeatureExtractor()
->>> print(w2v2_extractor)
-Wav2Vec2FeatureExtractor {
-  "do_normalize": true,
-  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
-  "feature_size": 1,
-  "padding_side": "right",
-  "padding_value": 0.0,
-  "return_attention_mask": false,
-  "sampling_rate": 16000
-}
-```
-
-<Tip>
-
-If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default feature extractor parameters.
-
-</Tip>
-
-Modify any of the [`Wav2Vec2FeatureExtractor`] parameters to create your custom feature extractor:
-
-```py
->>> from transformers import Wav2Vec2FeatureExtractor
-
->>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False)
->>> print(w2v2_extractor)
-Wav2Vec2FeatureExtractor {
-  "do_normalize": false,
-  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
-  "feature_size": 1,
-  "padding_side": "right",
-  "padding_value": 0.0,
-  "return_attention_mask": false,
-  "sampling_rate": 8000
-}
-```
-
-## Processor
-
-For models that support multimodal tasks, 🤗 Transformers offers a processor class that conveniently wraps processing classes such as a feature extractor and a tokenizer into a single object. For example, let's use the [`Wav2Vec2Processor`] for an automatic speech recognition task (ASR). ASR transcribes audio to text, so you will need a feature extractor and a tokenizer.
-
-Create a feature extractor to handle the audio inputs:
-
-```py
->>> from transformers import Wav2Vec2FeatureExtractor
-
->>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True)
-```
-
-Create a tokenizer to handle the text inputs:
-
-```py
->>> from transformers import Wav2Vec2CTCTokenizer
-
->>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt")
-```
-
-Combine the feature extractor and tokenizer in [`Wav2Vec2Processor`]:
-
-```py
->>> from transformers import Wav2Vec2Processor
-
->>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
-```
-
-With two basic classes - configuration and model - and an additional preprocessing class (tokenizer, image processor, feature extractor, or processor), you can create any of the models supported by 🤗 Transformers. Each of these base classes are configurable, allowing you to use the specific attributes you want. You can easily setup a model for training or modify an existing pretrained model to fine-tune.
--- a/docs/source/en/custom_models.md
+++ b/docs/source/en/custom_models.md
@ -1,4 +1,4 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -14,45 +14,33 @@ rendered properly in your Markdown viewer.

 -->

-# Building custom models
+# Customizing models

-The 🤗 Transformers library is designed to be easily extensible. Every model is fully coded in a given subfolder
-of the repository with no abstraction, so you can easily copy a modeling file and tweak it to your needs.
+Transformers models are designed to be customizable. A models code is fully contained in the [model](https://github.com/huggingface/transformers/tree/main/src/transformers/models) subfolder of the Transformers repository. Each folder contains a `modeling.py` and a `configuration.py` file. Copy these files to start customizing a model.

-If you are writing a brand new model, it might be easier to start from scratch. In this tutorial, we will show you
-how to write a custom model and its configuration so it can be used inside Transformers, and how you can share it
-with the community (with the code it relies on) so that anyone can use it, even if it's not present in the 🤗
-Transformers library. We'll see how to build upon transformers and extend the framework with your hooks and
-custom code.
+> [!TIP]
+> It may be easier to start from scratch if you're creating an entirely new model. But for models that are very similar to an existing one in Transformers, it is faster to reuse or subclass the same configuration and model class.

-We will illustrate all of this on a ResNet model, by wrapping the ResNet class of the
-[timm library](https://github.com/rwightman/pytorch-image-models) into a [`PreTrainedModel`].
+This guide will show you how to customize a ResNet model, enable [AutoClass](./models#autoclass) support, and share it on the Hub.

-## Writing a custom configuration
+## Configuration

-Before we dive into the model, let's first write its configuration. The configuration of a model is an object that
-will contain all the necessary information to build the model. As we will see in the next section, the model can only
-take a `config` to be initialized, so we really need that object to be as complete as possible.
+A configuration, given by the base [`PretrainedConfig`] class, contains all the necessary information to build a model. This is where you'll configure the attributes of the custom ResNet model. Different attributes gives different ResNet model types.

-<Tip>
+The main rules for customizing a configuration are:

-Models in the `transformers` library itself generally follow the convention that they accept a `config` object
-in their `__init__` method, and then pass the whole `config` to sub-layers in the model, rather than breaking the 
-config object into multiple arguments that are all passed individually to sub-layers. Writing your model in this 
-style results in simpler code with a clear "source of truth" for any hyperparameters, and also makes it easier
-to reuse code from other models in `transformers`.
+1. A custom configuration must subclass [`PretrainedConfig`]. This ensures a custom model has all the functionality of a Transformers' model such as [`~PretrainedConfig.from_pretrained`], [`~PretrainedConfig.save_pretrained`], and [`~PretrainedConfig.push_to_hub`].
+2. The [`PretrainedConfig`] `__init__` must accept any `kwargs` and they must be passed to the superclass `__init__`. [`PretrainedConfig`] has more fields than the ones set in your custom configuration, so when you load a configuration with [`~PretrainedConfig.from_pretrained`], those fields need to be accepted by your configuration and passed to the superclass.

-</Tip>
+> [!TIP]
+> It is useful to check the validity of some of the parameters. In the example below, a check is implemented to ensure `block_type` and `stem_type` belong to one of the predefined values.
+>
+> Add `model_type` to the configuration class to enable [AutoClass](./models#autoclass) support.

-In our example, we will take a couple of arguments of the ResNet class that we might want to tweak. Different
-configurations will then give us the different types of ResNets that are possible. We then just store those arguments,
-after checking the validity of a few of them.
-
-```python
+```py
 from transformers import PretrainedConfig
 from typing import List

-
 class ResnetConfig(PretrainedConfig):
    model_type = "resnet"

@ -86,56 +74,38 @@ class ResnetConfig(PretrainedConfig):
        super().__init__(**kwargs)
 ```

-The three important things to remember when writing you own configuration are the following:
- you have to inherit from `PretrainedConfig`,
- the `__init__` of your `PretrainedConfig` must accept any kwargs,
- those `kwargs` need to be passed to the superclass `__init__`.
-
-The inheritance is to make sure you get all the functionality from the 🤗 Transformers library, while the two other
-constraints come from the fact a `PretrainedConfig` has more fields than the ones you are setting. When reloading a
-config with the `from_pretrained` method, those fields need to be accepted by your config and then sent to the
-superclass.
-
-Defining a `model_type` for your configuration (here `model_type="resnet"`) is not mandatory, unless you want to
-register your model with the auto classes (see last section).
-
-With this done, you can easily create and save your configuration like you would do with any other model config of the
-library. Here is how we can create a resnet50d config and save it:
+Save the configuration to a JSON file in your custom model folder, `custom-resnet`, with [`~PretrainedConfig.save_pretrained`].

 ```py
 resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
 resnet50d_config.save_pretrained("custom-resnet")
 ```

-This will save a file named `config.json` inside the folder `custom-resnet`. You can then reload your config with the
-`from_pretrained` method:
+## Model

-```py
-resnet50d_config = ResnetConfig.from_pretrained("custom-resnet")
-```
+With the custom ResNet configuration, you can now create and customize the model. The model subclasses the base [`PreTrainedModel`] class. Like [`PretrainedConfig`], inheriting from [`PreTrainedModel`] and initializing the superclass with the configuration extends Transformers' functionalities such as saving and loading to the custom model.

-You can also use any other method of the [`PretrainedConfig`] class, like [`~PretrainedConfig.push_to_hub`] to
-directly upload your config to the Hub.
+Transformers' models follow the convention of accepting a `config` object in the `__init__` method. This passes the entire `config` to the model sublayers, instead of breaking the `config` object into multiple arguments that are individually passed to the sublayers.

-## Writing a custom model
+Writing models this way produces simpler code with a clear source of truth for any hyperparameters. It also makes it easier to reuse code from other Transformers' models.

-Now that we have our ResNet configuration, we can go on writing the model. We will actually write two: one that
-extracts the hidden features from a batch of images (like [`BertModel`]) and one that is suitable for image
-classification (like [`BertForSequenceClassification`]).
+You'll create two ResNet models, a barebones ResNet model that outputs the hidden states and a ResNet model with an image classification head.

-As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. The only
-thing we need to do before writing this class is a map between the block types and actual block classes. Then the
-model is defined from the configuration by passing everything to the `ResNet` class:
+<hfoptions id="resnet">
+<hfoption id="ResnetModel">
+
+Define a mapping between the block types and classes. Everything else is created by passing the configuration class to the ResNet model class.
+
+> [!TIP]
+> Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support.

 ```py
 from transformers import PreTrainedModel
 from timm.models.resnet import BasicBlock, Bottleneck, ResNet
 from .configuration_resnet import ResnetConfig

-
 BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck}

-
 class ResnetModel(PreTrainedModel):
    config_class = ResnetConfig

@ -158,12 +128,17 @@ class ResnetModel(PreTrainedModel):
        return self.model.forward_features(tensor)
 ```

-For the model that will classify images, we just change the forward method:
+</hfoption>
+<hfoption id="ResnetModelForImageClassification">
+
+The `forward` method needs to be rewritten to calculate the loss for each logit if labels are available. Otherwise, the ResNet model class is the same.
+
+> [!TIP]
+> Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support.

 ```py
 import torch

-
 class ResnetModelForImageClassification(PreTrainedModel):
    config_class = ResnetConfig

@ -190,34 +165,20 @@ class ResnetModelForImageClassification(PreTrainedModel):
        return {"logits": logits}
 ```

-In both cases, notice how we inherit from `PreTrainedModel` and call the superclass initialization with the `config`
-(a bit like when you write a regular `torch.nn.Module`). The line that sets the `config_class` is not mandatory, unless
-you want to register your model with the auto classes (see last section).
+</hfoption>
+</hfoptions>

-<Tip>
+A model can return any output format. Returning a dictionary (like `ResnetModelForImageClassification`) with losses when labels are available makes the custom model compatible with [`Trainer`]. For other output formats, you'll need your own training loop or a different library for training.

-If your model is very similar to a model inside the library, you can re-use the same configuration as this model.
-
-</Tip>
-
-You can have your model return anything you want, but returning a dictionary like we did for
-`ResnetModelForImageClassification`, with the loss included when labels are passed, will make your model directly
-usable inside the [`Trainer`] class. Using another output format is fine as long as you are planning on using your own
-training loop or another library for training.
-
-Now that we have our model class, let's create one:
+Instantiate the custom model class with the configuration.

 ```py
 resnet50d = ResnetModelForImageClassification(resnet50d_config)
 ```

-Again, you can use any of the methods of [`PreTrainedModel`], like [`~PreTrainedModel.save_pretrained`] or
-[`~PreTrainedModel.push_to_hub`]. We will use the second in the next section, and see how to push the model weights
-with the code of our model. But first, let's load some pretrained weights inside our model.
+At this point, you can load pretrained weights into the model or train it from scratch. In this guide, you'll load pretrained weights.

-In your own use case, you will probably be training your custom model on your own data. To go fast for this tutorial,
-we will use the pretrained version of the resnet50d. Since our model is just a wrapper around it, it's going to be
-easy to transfer those weights:
+Load the pretrained weights from the [timm](https://hf.co/docs/timm/index) library, and then transfer those weights to the custom model with [load_state_dict](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.load_state_dict).

 ```py
 import timm
@ -226,17 +187,14 @@ pretrained_model = timm.create_model("resnet50d", pretrained=True)
 resnet50d.model.load_state_dict(pretrained_model.state_dict())
 ```

-Now let's see how to make sure that when we do [`~PreTrainedModel.save_pretrained`] or [`~PreTrainedModel.push_to_hub`], the
-code of the model is saved.
+## AutoClass

-## Registering a model with custom code to the auto classes
+The [AutoClass](./models#model-classes) API is a shortcut for automatically loading the correct architecture for a given model. It is convenient to enable this for users loading your custom model.

-If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own
-model. This is different from pushing the code to the Hub in the sense that users will need to import your library to
-get the custom models (contrarily to automatically downloading the model code from the Hub).
+Make sure you have the `model_type` attribute (must be different from existing model types) in the configuration class and `config_class` attribute in the model class. Use the [`~AutoConfig.register`] method to add the custom configuration and model to the [AutoClass](./models#model-classes) API.

-As long as your config has a `model_type` attribute that is different from existing model types, and that your model
-classes have the right `config_class` attributes, you can just add them to the auto classes like this:
+> [!TIP]
+> The first argument to [`AutoConfig.register`] must match the `model_type` attribute in the custom configuration class, and the first argument to [`AutoModel.register`] must match the `config_class` of the custom model class.

 ```py
 from transformers import AutoConfig, AutoModel, AutoModelForImageClassification
@ -246,25 +204,23 @@ AutoModel.register(ResnetConfig, ResnetModel)
 AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification)
 ```

-Note that the first argument used when registering your custom config to [`AutoConfig`] needs to match the `model_type`
-of your custom config, and the first argument used when registering your custom models to any auto model class needs
-to match the `config_class` of those models.
+Your custom model code is now compatible with the [AutoClass](./models#autoclass) API. Users can load the model with the [AutoModel](./model_doc/auto#automodel) or [`AutoModelForImageClassification`] classes.

-## Sending the code to the Hub
+## Upload

-<Tip warning={true}>
+Upload a custom model to the [Hub](https://hf.co/models) to allow other users to easily load and use it.

-This API is experimental and may have some slight breaking changes in the next releases.
+Ensure the model directory is structured correctly as shown below. The directory should contain:

-</Tip>
+- `modeling.py`: Contains the code for `ResnetModel` and `ResnetModelForImageClassification`. This file can rely on relative imports to other files as long as they're in the same directory.

-First, make sure your model is fully defined in a `.py` file. It can rely on relative imports to some other files as
-long as all the files are in the same directory (we don't support submodules for this feature yet). For our example,
-we'll define a `modeling_resnet.py` file and a `configuration_resnet.py` file in a folder of the current working
-directory named `resnet_model`. The configuration file contains the code for `ResnetConfig` and the modeling file
-contains the code of `ResnetModel` and `ResnetModelForImageClassification`.
+> [!WARNING]
+> When copying a Transformers' model file, replace all relative imports at the top of the `modeling.py` file to import from Transformers instead.

-```
+- `configuration.py`: Contains the code for `ResnetConfig`.
+- `__init__.py`: Can be empty, this file allows Python `resnet_model` to be used as a module.
+
+```bash
 .
 └── resnet_model
    ├── __init__.py
@ -272,27 +228,16 @@ contains the code of `ResnetModel` and `ResnetModelForImageClassification`.
    └── modeling_resnet.py
 ```

-The `__init__.py` can be empty, it's just there so that Python detects `resnet_model` can be use as a module.
-
-<Tip warning={true}>
-
-If copying a modeling files from the library, you will need to replace all the relative imports at the top of the file
-to import from the `transformers` package.
-
-</Tip>
-
-Note that you can re-use (or subclass) an existing configuration/model.
-
-To share your model with the community, follow those steps: first import the ResNet model and config from the newly
-created files:
+To share the model, import the ResNet model and configuration.

 ```py
 from resnet_model.configuration_resnet import ResnetConfig
 from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification
 ```

-Then you have to tell the library you want to copy the code files of those objects when using the `save_pretrained`
-method and properly register them with a given Auto class (especially for models), just run:
+Copy the code from the model and configuration files. To make sure the AutoClass objects are saved with [`~PreTrainedModel.save_pretrained`], call the [`~PretrainedConfig.register_for_auto_class`] method. This modifies the configuration JSON file to include the AutoClass objects and mapping.
+
+For a model, pick the appropriate `AutoModelFor` class based on the task.

 ```py
 ResnetConfig.register_for_auto_class()
@ -300,27 +245,17 @@ ResnetModel.register_for_auto_class("AutoModel")
 ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification")
 ```

-Note that there is no need to specify an auto class for the configuration (there is only one auto class for them,
-[`AutoConfig`]) but it's different for models. Your custom model could be suitable for many different tasks, so you
-have to specify which one of the auto classes is the correct one for your model.
-
-<Tip>
-
-Use `register_for_auto_class()` if you want the code files to be copied. If you instead prefer to use code on the Hub from another repo, 
-you don't need to call it. In cases where there's more than one auto class, you can modify the `config.json` directly using the 
-following structure:
+To map more than one task to the model, edit `auto_map` in the configuration JSON file directly.

 ```json
-"auto_map": {     
-	"AutoConfig": "<your-repo-name>--<config-name>",     
-	"AutoModel": "<your-repo-name>--<config-name>",
-	"AutoModelFor<Task>": "<your-repo-name>--<config-name>",    
+"auto_map": {
+    "AutoConfig": "<your-repo-name>--<config-name>",
+    "AutoModel": "<your-repo-name>--<config-name>",
+    "AutoModelFor<Task>": "<your-repo-name>--<config-name>",    
 },
 ```

-</Tip>
-
-Next, let's create the config and models as we did before:
+Create the configuration and model and load pretrained weights into it.

 ```py
 resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
@ -330,13 +265,17 @@ pretrained_model = timm.create_model("resnet50d", pretrained=True)
 resnet50d.model.load_state_dict(pretrained_model.state_dict())
 ```

-Now to send the model to the Hub, make sure you are logged in. Either run in your terminal:
+The model is ready to be pushed to the Hub now. Log in to your Hugging Face account from the command line or notebook.
+
+<hfoptions id="push">
+<hfoption id="huggingface-CLI">

 ```bash
 huggingface-cli login
 ```

-or from a notebook:
+</hfoption>
+<hfoption id="notebook">

 ```py
 from huggingface_hub import notebook_login
@ -344,41 +283,15 @@ from huggingface_hub import notebook_login
 notebook_login()
 ```

-You can then push to your own namespace (or an organization you are a member of) like this:
+</hfoption>
+</hfoptions>
+
+Call [`~PreTrainedModel.push_to_hub`] on the model to upload the model to the Hub.

 ```py
 resnet50d.push_to_hub("custom-resnet50d")
 ```

-On top of the modeling weights and the configuration in json format, this also copied the modeling and
-configuration `.py` files in the folder `custom-resnet50d` and uploaded the result to the Hub. You can check the result
-in this [model repo](https://huggingface.co/sgugger/custom-resnet50d).
-
-See the [sharing tutorial](model_sharing) for more information on the push to Hub method.
-
-## Using a model with custom code
-
-You can use any configuration, model or tokenizer with custom code files in its repository with the auto-classes and
-the `from_pretrained` method. All files and code uploaded to the Hub are scanned for malware (refer to the [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) documentation for more information), but you should still 
-review the model code and author to avoid executing malicious code on your machine. Set `trust_remote_code=True` to use
-a model with custom code:
-
-```py
-from transformers import AutoModelForImageClassification
-
-model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True)
-```
-
-It is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not
-update the code with some malicious new lines (unless you fully trust the authors of the models).
-
-```py
-commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292"
-model = AutoModelForImageClassification.from_pretrained(
-    "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash
-)
-```
-
-Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit
-hash of any commit.
+The pretrained weights, configuration, `modeling.py` and `configuration.py` files should all be uploaded to the Hub now in a [repository](https://hf.co/sgugger/custom-resnet50d) under your namespace.

+Because a custom model doesn't use the same modeling code as a Transformers' model, you need to add `trust_remode_code=True` in [`~PreTrainedModel.from_pretrained`] to load it. Refer to the load [custom models](./models#custom-models) section for more information.
--- a/docs/source/en/debugging.md
+++ b/docs/source/en/debugging.md
@ -1,4 +1,4 @@
-<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -14,55 +14,52 @@ rendered properly in your Markdown viewer.

 -->

-# Debugging
+# Multi-GPU debugging

-Training on multiple GPUs can be a tricky endeavor whether you're running into installation issues or communication problems between your GPUs. This debugging guide covers some issues you may run into and how to resolve them.
+Distributed training can be tricky because you have to ensure you're using the correct CUDA version across your system. You may encounter inter-communication issues between GPUs, and there may be underflow or overflow problems in your model.

-## DeepSpeed CUDA installation
+This guide covers how to debug these issues, especially as it relates to DeepSpeed and PyTorch.

-If you're using DeepSpeed, you've probably already installed it with the following command.
+## DeepSpeed CUDA
+
+DeepSpeed compiles CUDA C++ which can be a potential source of errors when building PyTorch extensions that require CUDA. These errors depend on how CUDA is installed on your system. This section focuses on PyTorch built with *CUDA 10.2*

 ```bash
 pip install deepspeed
 ```

-DeepSpeed compiles CUDA C++ code and it can be a potential source of errors when building PyTorch extensions that require CUDA. These errors depend on how CUDA is installed on your system, and this section focuses on PyTorch built with *CUDA 10.2*.
+> [!TIP]
+> For any other installation issues, please [open an issue](https://github.com/microsoft/DeepSpeed/issues) with the DeepSpeed team.

-<Tip>
+### Non-identical toolkits

-For any other installation issues, please [open an issue](https://github.com/deepspeedai/DeepSpeed/issues) with the DeepSpeed team.
+PyTorch comes with its own CUDA toolkit, but to use DeepSpeed with PyTorch, you need to have an identical version of CUDA installed system-wide. For example, if you installed PyTorch with `cudatoolkit==10.2` in your Python environment, then you'll also need to have CUDA 10.2 installed everywhere.

-</Tip>
-
-### Non-identical CUDA toolkits
-
-PyTorch comes with its own CUDA toolkit, but to use DeepSpeed with PyTorch, you need to have an identical version of CUDA installed system-wide. For example, if you installed PyTorch with `cudatoolkit==10.2` in your Python environment, then you'll also need to have CUDA 10.2 installed system-wide. If you don't have CUDA installed system-wide, you should install it first.
-
-The exact location may vary from system to system, but `usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly setup and added to your `PATH` environment variable, you can find the installation location with the following command:
+The exact location can vary from system to system, but `usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly set up and added to your `PATH` environment variable, you can find the installation location with the following command.

 ```bash
 which nvcc
 ```

-### Multiple CUDA toolkits
+### Multiple toolkits

-You may also have more than one CUDA toolkit installed system-wide.
+You may also have more than one CUDA toolkit installed on your system.

 ```bash
 /usr/local/cuda-10.2
 /usr/local/cuda-11.0
 ```

-Typically, package installers set the paths to whatever the last version was installed. If the package build fails because it can't find the right CUDA version (despite it being installed system-wide already), then you need to configure the `PATH` and `LD_LIBRARY_PATH` environment variables to point to the correct path.
+Typically, package installers set the paths to whatever the last version was installed. If the package build fails because it can't find the right CUDA version (despite it being installed already), then you need to configure the `PATH` and `LD_LIBRARY_PATH` environment variables to point to the correct path.

-Take a look at the contents of these environment variables first:
+Take a look at the contents of the following environment variables first.

 ```bash
 echo $PATH
 echo $LD_LIBRARY_PATH
 ```

-`PATH` lists the locations of the executables and `LD_LIBRARY_PATH` lists where to look for shared libraries. Earlier entries are prioritized over later ones, and `:` is used to separate multiple entries. To tell the build program where to find the specific CUDA toolkit you want, insert the correct path to list first. This command prepends rather than overwrites the existing values.
+`PATH` lists the locations of the executables and `LD_LIBRARY_PATH` lists where to look for shared libraries. Earlier entries are prioritized over later ones, and `:` is used to separate multiple entries. To find a specific CUDA toolkit, insert the correct path to list first. This command prepends rather than overwrites the existing values.

 ```bash
 # adjust the version and full path if needed
@ -70,23 +67,23 @@ export PATH=/usr/local/cuda-10.2/bin:$PATH
 export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH
 ```

-In addition, you should also check the directories you assign actually exist. The `lib64` sub-directory contains various CUDA `.so` objects (like `libcudart.so`) and while it is unlikely your system names them differently, you should check the actual names and change them accordingly.
+In addition, you should also check that the assigned directories actually exist. The `lib64` sub-directory contains various CUDA `.so` objects (like `libcudart.so`), and while it is unlikely your system names them differently, you should check the actual names and change them accordingly.

-### Older CUDA versions
+### Older versions

 Sometimes, older CUDA versions may refuse to build with newer compilers. For example, if you have `gcc-9` but CUDA wants `gcc-7`. Usually, installing the latest CUDA toolkit enables support for the newer compiler.

-You could also install an older version of the compiler in addition to the one you're currently using (or it may already be installed but it's not used by default and the build system can't see it). To resolve this, you can create a symlink to give the build system visibility to the older compiler.
+You could also install an older version of the compiler in addition to the one you're currently using (or it may already be installed but it's not used by default and the build system can't see it). To resolve this, create a symlink to give the build system visibility to the older compiler.

 ```bash
-# adapt the path to your system
+# adjust the path to your system
 sudo ln -s /usr/bin/gcc-7  /usr/local/cuda-10.2/bin/gcc
 sudo ln -s /usr/bin/g++-7  /usr/local/cuda-10.2/bin/g++
 ```

 ### Prebuild

-If you're still having issues with installing DeepSpeed or if you're building DeepSpeed at run time, you can try to prebuild the DeepSpeed modules before installing them. To make a local build for DeepSpeed:
+If you're still having issues with installing DeepSpeed or if you're building DeepSpeed at run time, try to prebuild the DeepSpeed modules before installing them. Run the commands below to make a local build for DeepSpeed.

 ```bash
 git clone https://github.com/deepspeedai/DeepSpeed/
@ -97,19 +94,16 @@ TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \
 --disable-pip-version-check 2>&1 | tee build.log
 ```

-<Tip>
+> [!TIP]
+> Add the `DS_BUILD_AIO=1` parameter to the build command to use NVMe offload. Make sure you install the libaio-dev package across your system.

-To use NVMe offload, add the `DS_BUILD_AIO=1` parameter to the build command and make sure you install the libaio-dev package system-wide.
-
-</Tip>
-
-Next, you'll have to specify your GPU's architecture by editing the `TORCH_CUDA_ARCH_LIST` variable (find a complete list of NVIDIA GPUs and their corresponding architectures on this [page](https://developer.nvidia.com/cuda-gpus)). To check the PyTorch version that corresponds to your architecture, run the following command:
+Next, specify your GPUs architecture by editing the `TORCH_CUDA_ARCH_LIST` variable (find a complete list of NVIDIA GPUs and their corresponding architectures on this [page](https://developer.nvidia.com/cuda-gpus)). To check the PyTorch version that corresponds to your architecture, run the following command.

 ```bash
 python -c "import torch; print(torch.cuda.get_arch_list())"
 ```

-Find the architecture for a GPU with the following command:
+Find the architecture for a GPU with the following command.

 <hfoptions id="arch">
 <hfoption id="same GPUs">
@ -121,7 +115,7 @@ CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capa
 </hfoption>
 <hfoption id="specific GPU">

-To find the architecture for GPU `0`:
+Run the following command to find the architecture for GPU `0`. The results will show a value for `major` and `minor`, which is your GPU architecture. The GPU architecture below is `8.6`.

 ```bash
 CUDA_VISIBLE_DEVICES=0 python -c "import torch; \
@ -129,8 +123,6 @@ print(torch.cuda.get_device_properties(torch.device('cuda')))
 "_CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82)"
 ```

-This means your GPU architecture is `8.6`.
-
 </hfoption>
 </hfoptions>

@ -138,7 +130,7 @@ If you get `8, 6`, then you can set `TORCH_CUDA_ARCH_LIST="8.6"`. For multiple G

 It is also possible to not specify `TORCH_CUDA_ARCH_LIST` and the build program automatically queries the GPU architecture of the build. However, it may or may not match the actual GPU on the target machine which is why it is better to explicitly specify the correct architecture.

-For training on multiple machines with the same setup, you'll need to make a binary wheel:
+For training on multiple machines with the same setup, you'll need to make a binary wheel as shown below.

 ```bash
 git clone https://github.com/deepspeedai/DeepSpeed/
@ -148,88 +140,64 @@ TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \
 python setup.py build_ext -j8 bdist_wheel
 ```

-This command generates a binary wheel that'll look something like `dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`. Now you can install this wheel locally or on another machine.
+This command generates a binary wheel that'll look something like `dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`. Install this wheel locally or on another machine.

 ```bash
 pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl
 ```

-## Multi-GPU Network Issues Debug
+## Communication

-When training or inferencing with `DistributedDataParallel` and multiple GPU, if you run into issue of inter-communication between processes and/or nodes, you can use the following script to diagnose network issues.
+Distributed training involves communication between processes and or nodes and this can be a potential source of errors.
+
+Download the script below to diagnose network issues, and then run it to test GPU communication. The example command below tests how two GPUs communicate. Adjust the `--nproc_per_node` and `--nnodes` parameters to adapt it to your system.

 ```bash
 wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py
-```
-
-For example to test how 2 GPUs interact do:
-
-```bash
 python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
 ```
-If both processes can talk to each and allocate GPU memory each will print an OK status.

-For more GPUs or nodes adjust the arguments in the script.
+The script prints an `OK` status if both GPUs are able to communicate and allocate memory. Take a closer look at the diagnostic script for more details and a recipe for running it in a SLURM environment.

-You will find a lot more details inside the diagnostics script and even a recipe to how you could run it in a SLURM environment.
-
-An additional level of debug is to add `NCCL_DEBUG=INFO` environment variable as follows:
+Add the `NCCL_DEBUG=INFO` environment variable to report more NCCL-related debugging information.

 ```bash
 NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
 ```

-This will dump a lot of NCCL-related debug information, which you can then search online if you find that some problems are reported. Or if you're not sure how to interpret the output you can share the log file in an Issue.
+## Underflow and overflow detection

+Underflow and overflow can occur when activations or weights are `inf`, `nan`, and when `loss=NaN`. This may indicate an underflow or overflow issue. To detect these issues, activate the `DebugUnderflowOverflow` module in [`TrainingArguments.debug`] or import and add the module to your own training loop or another trainer class.

+<hfoptions id="overflow">
+<hfoption id="Trainer">

-## Underflow and Overflow Detection
+```py
+from transformers import TrainingArguments

-<Tip>
-
-This feature is currently available for PyTorch-only.
-
-</Tip>
-
-<Tip>
-
-For multi-GPU training it requires DDP (`torch.distributed.launch`).
-
-</Tip>
-
-<Tip>
-
-This feature can be used with any `nn.Module`-based model.
-
-</Tip>
-
-If you start getting `loss=NaN` or the model exhibits some other abnormal behavior due to `inf` or `nan` in
-activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily
-you can accomplish that easily by activating a special module that will do the detection automatically.
-
-If you're using [`Trainer`], you just need to add:
-
-```bash
--debug underflow_overflow
+args = TrainingArguments(
+    debug="underflow_overflow",
+    ...
+)
 ```

-to the normal command line arguments, or pass `debug="underflow_overflow"` when creating the
-[`TrainingArguments`] object.
+</hfoption>
+<hfoption id="PyTorch training loop">

-If you're using your own training loop or another Trainer you can accomplish the same with:
-
-```python
+```py
 from transformers.debug_utils import DebugUnderflowOverflow

 debug_overflow = DebugUnderflowOverflow(model)
 ```

-[`~debug_utils.DebugUnderflowOverflow`] inserts hooks into the model that immediately after each
-forward call will test input and output variables and also the corresponding module's weights. As soon as `inf` or
-`nan` is detected in at least one element of the activations or weights, the program will assert and print a report
-like this (this was caught with `google/mt5-small` under fp16 mixed precision):
+</hfoption>
+</hfoptions>

-```
+The [`~debug_utils.DebugUnderflowOverflow`] module inserts hooks into the model to test the input and output variables and the corresponding model weights after each forward call. If `inf` or `nan` is detected in at least one element of the activations or weights, the module prints a report like the one shown below.
+
+The example below is for fp16 mixed precision training with [google/mt5-small](https://huggingface.co/google/mt5-small).
+
+```shell
 Detected inf/nan during batch_number=0
 Last 21 forward frames:
 abs min  abs max  metadata
@ -269,48 +237,20 @@ abs min  abs max  metadata
 0.00e+00      inf output
 ```

-The example output has been trimmed in the middle for brevity.
+At the start of the report, you can see which batch number the error occurred. In this case, it occurred on the first batch.

-The second column shows the value of the absolute largest element, so if you have a closer look at the last few frames,
-the inputs and outputs were in the range of `1e4`. So when this training was done under fp16 mixed precision the very
-last step overflowed (since under `fp16` the largest number before `inf` is `64e3`). To avoid overflows under
-`fp16` the activations must remain way below `1e4`, because `1e4 * 1e4 = 1e8` so any matrix multiplication with
-large activations is going to lead to a numerical overflow condition.
+Each frame describes the module it is reporting on. For example, the frame below inspected `encoder.block.2.layer.1.layer_norm`. This indicates the layer norm in the first layer of the second block of the encoder. The forward calls are to `T5LayerNorm`.

-At the very start of the trace you can discover at which batch number the problem occurred (here `Detected inf/nan during batch_number=0` means the problem occurred on the first batch).
-
-Each reported frame starts by declaring the fully qualified entry for the corresponding module this frame is reporting
-for. If we look just at this frame:
-
-```
+```shell
                  encoder.block.2.layer.1.layer_norm T5LayerNorm
 8.69e-02 4.18e-01 weight
 2.65e-04 3.42e+03 input[0]
 1.79e-06 4.65e+00 output
 ```

-Here, `encoder.block.2.layer.1.layer_norm` indicates that it was a layer norm for the first layer, of the second
-block of the encoder. And the specific calls of the `forward` is `T5LayerNorm`.
+The last frame reports on the `Dropout.forward` function. It called the `dropout` attribute from inside the `DenseReluDense` class. You can observe that the overflow (`inf`) occurred in the first layer of the encoders second block in the first batch. The absolute largest input element was 6.27e+04.

-Let's look at the last few frames of that report:
-
-```
-Detected inf/nan during batch_number=0
-Last 21 forward frames:
-abs min  abs max  metadata
-[...]
-                  encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
-2.17e-07 4.50e+00 weight
-1.79e-06 4.65e+00 input[0]
-2.68e-06 3.70e+01 output
-                  encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
-8.08e-07 2.66e+01 weight
-1.79e-06 4.65e+00 input[0]
-1.27e-04 2.37e+02 output
-                  encoder.block.2.layer.1.DenseReluDense.wo Linear
-1.01e-06 6.44e+00 weight
-0.00e+00 9.74e+03 input[0]
-3.18e-04 6.27e+04 output
+```shell
                  encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
 1.79e-06 4.65e+00 input[0]
 3.18e-04 6.27e+04 output
@ -319,22 +259,11 @@ abs min  abs max  metadata
 0.00e+00      inf output
 ```

-The last frame reports for `Dropout.forward` function with the first entry for the only input and the second for the
-only output. You can see that it was called from an attribute `dropout` inside `DenseReluDense` class. We can see
-that it happened during the first layer, of the 2nd block, during the very first batch. Finally, the absolute largest
-input elements was `6.27e+04` and same for the output was `inf`.
+The `T5DenseGatedGeluDense.forward` function output activations had an absolute maximum value of 6.27e+04 which is close to fp16s maximum limit of 6.4e+04. In the next step, `Dropout` renormalizes the weights, after zeroing some elements, which pushes the absolute maximum value to greater than 6.4e+04 resulting in an overflow.

-You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value was
-around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which renormalizes
-the weights, after it zeroed some of the elements, which pushes the absolute max value to more than 64K, and we get an
-overflow (`inf`).
+Now that you know where the error is happening, you can investigate the modeling code in [modeling_t5.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py).

-As you can see it's the previous frames that we need to look into when the numbers start going into very large for fp16
-numbers.
-
-Let's match the report to the code from `models/t5/modeling_t5.py`:
-
-```python
+```py
 class T5DenseGatedGeluDense(nn.Module):
    def __init__(self, config):
        super().__init__()
@ -353,29 +282,11 @@ class T5DenseGatedGeluDense(nn.Module):
        return hidden_states
 ```

-Now it's easy to see the `dropout` call, and all the previous calls as well.
-
-Since the detection is happening in a forward hook, these reports are printed immediately after each `forward`
-returns.
-
-Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers
-started to go up and most likely switch to the `fp32` mode here, so that the numbers don't overflow when multiplied
-or summed up. Of course, there might be other solutions. For example, we could turn off `amp` temporarily if it's
-enabled, after moving the original `forward` into a helper wrapper, like so:
-
-```python
-def _forward(self, hidden_states):
-    hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
-    hidden_linear = self.wi_1(hidden_states)
-    hidden_states = hidden_gelu * hidden_linear
-    hidden_states = self.dropout(hidden_states)
-    hidden_states = self.wo(hidden_states)
-    return hidden_states
-
+One solution is to go back a few steps before the values started growing too large and switch to fp32 so the numbers don't overflow when multiplied or summed. Another potential solution is to temporarily disable mixed precision training (`amp`).

+```py
 import torch

-
 def forward(self, hidden_states):
    if torch.is_autocast_enabled():
        with torch.cuda.amp.autocast(enabled=False):
@ -384,14 +295,11 @@ def forward(self, hidden_states):
        return self._forward(hidden_states)
 ```

-Since the automatic detector only reports on inputs and outputs of full frames, once you know where to look, you may
-want to analyse the intermediary stages of any specific `forward` function as well. In such a case you can use the
-`detect_overflow` helper function to inject the detector where you want it, for example:
+The report only returns inputs and outputs of full frames, so you may also want to analyze the intermediate values of any `forward` function as well. Add the `detect_overflow` function after the forward calls to track `inf` or `nan` values in the intermediate `forwarded_states`.

-```python
+```py
 from debug_utils import detect_overflow

-
 class T5LayerFF(nn.Module):
    [...]

@ -403,40 +311,25 @@ class T5LayerFF(nn.Module):
        return hidden_states + self.dropout(forwarded_states)
 ```

-You can see that we added 2 of these and now we track if `inf` or `nan` for `forwarded_states` was detected
-somewhere in between.
+Finally, you can configure the number of frames printed by [`~debug_utils.DebugUnderflowOverflow`].

-Actually, the detector already reports these because each of the calls in the example above is a `nn.Module`, but
-let's say if you had some local direct calculations this is how you'd do that.
-
-Additionally, if you're instantiating the debugger in your own code, you can adjust the number of frames printed from
-its default, e.g.:
-
-```python
+```py
 from transformers.debug_utils import DebugUnderflowOverflow

 debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
 ```

-### Specific batch absolute min and max value tracing
+### Batch tracing

-The same debugging class can be used for per-batch tracing with the underflow/overflow detection feature turned off.
+[`~debug_utils.DebugUnderflowOverflow`] is able to trace the absolute minimum and maximum values in each batch with the underflow and overflow feature disabled. This is useful for identifying where errors are occurring in the model.

-Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a given
-batch, and only do that for batches 1 and 3. Then you instantiate this class as:
+The example below shows how to trace the minimum and maximum values in batches 1 and 3 (batches are zero-indexd).

-```python
+```py
 debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
 ```

-And now full batches 1 and 3 will be traced using the same format as the underflow/overflow detector does.
-
-Batches are 0-indexed.
-
-This is helpful if you know that the program starts misbehaving after a certain batch number, so you can fast-forward
-right to that area. Here is a sample truncated output for such configuration:
-
-```
+```shell
                  *** Starting batch number=1 ***
 abs min  abs max  metadata
                  shared Embedding
@ -465,13 +358,10 @@ abs min  abs max  metadata
 [...]
 ```

-Here you will get a huge number of frames dumped - as many as there were forward calls in your model, so it may or may
-not what you want, but sometimes it can be easier to use for debugging purposes than a normal debugger. For example, if
-a problem starts happening at batch number 150. So you can dump traces for batches 149 and 150 and compare where
-numbers started to diverge.
+[`~debug_utils.DebugUnderflowOverflow`] reports on a large number of frames which is easier for debugging. Once you know where a problem is occurring, say batch 150, then you can focus the trace for batches 149 and 150 and compare where the numbers are diverging.

-You can also specify the batch number after which to stop the training, with:
+It is also possible to abort the trace after a certain batch number, for example, batch 3.

-```python
+```py
 debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
 ```
--- a/docs/source/en/deepspeed.md
+++ b/docs/source/en/deepspeed.md
--- a/docs/source/en/executorch.md
+++ b/docs/source/en/executorch.md
@ -0,0 +1,59 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# ExecuTorch
+
+[ExecuTorch](https://pytorch.org/executorch/stable/index.html) is a platform that enables PyTorch training and inference programs to be run on mobile and edge devices. It is powered by [torch.compile](https://pytorch.org/docs/stable/torch.compiler.html) and [torch.export](https://pytorch.org/docs/main/export.html) for performance and deployment.
+
+You can use ExecuTorch with Transformers with [torch.export](https://pytorch.org/docs/main/export.html). The [`~transformers.convert_and_export_with_cache`] method converts a [`PreTrainedModel`] into an exportable module. Under the hood, it uses [torch.export](https://pytorch.org/docs/main/export.html) to export the model, ensuring compatibility with ExecuTorch.
+
+```py
+import torch
+from transformers import LlamaForCausalLM, AutoTokenizer, GenerationConfig
+from transformers.integrations.executorch import(
+    TorchExportableModuleWithStaticCache,
+    convert_and_export_with_cache
+)
+
+generation_config = GenerationConfig(
+    use_cache=True,
+    cache_implementation="static",
+    cache_config={
+        "batch_size": 1,
+        "max_cache_len": 20,
+    }
+)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", pad_token="</s>", padding_side="right")
+model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="sdpa", generation_config=generation_config)
+
+exported_program = convert_and_export_with_cache(model)
+```
+
+The exported PyTorch model is now ready to be used with ExecuTorch. Wrap the model with [`~transformers.TorchExportableModuleWithStaticCache`] to generate text.
+
+```py
+prompts = ["Simply put, the theory of relativity states that "]
+prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
+prompt_token_ids = prompt_tokens["input_ids"]
+
+generated_ids = TorchExportableModuleWithStaticCache.generate(
+    exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=20,
+)
+generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+print(generated_text)
+['Simply put, the theory of relativity states that 1) the speed of light is the']
+```
--- a/docs/source/en/fast_tokenizers.md
+++ b/docs/source/en/fast_tokenizers.md
@ -1,4 +1,4 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -14,61 +14,349 @@ rendered properly in your Markdown viewer.

 -->

-# Use tokenizers from 🤗 Tokenizers
+# Tokenizers

-The [`PreTrainedTokenizerFast`] depends on the [🤗 Tokenizers](https://huggingface.co/docs/tokenizers) library. The tokenizers obtained from the 🤗 Tokenizers library can be
-loaded very simply into 🤗 Transformers.
+Tokenizers convert text into an array of numbers known as tensors, the inputs to a text model. There are several tokenizer algorithms, but they all share the same purpose. Split text into smaller words or subwords (tokens) according to some rules, and convert them into numbers (input ids). A Transformers tokenizer also returns an attention mask to indicate which tokens should be attended to.

-Before getting in the specifics, let's first start by creating a dummy tokenizer in a few lines:
+> [!TIP]
+> Learn about the most popular tokenization algorithms on the [Summary of the tokenizers](./tokenizer_summary) doc.

-```python
->>> from tokenizers import Tokenizer
->>> from tokenizers.models import BPE
->>> from tokenizers.trainers import BpeTrainer
->>> from tokenizers.pre_tokenizers import Whitespace
+Call [`~PreTrainedTokenizer.from_pretrained`] to load a tokenizer and its configuration from the Hugging Face [Hub](https://hf.co) or a local directory. The pretrained tokenizer is saved in a [tokenizer.model](https://huggingface.co/google/gemma-2-2b/blob/main/tokenizer.model) file with all its associated vocabulary files.

->>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
->>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+Pass a string of text to the tokenizer to return the input ids and attention mask, and set the framework tensor type to return with the `return_tensors` parameter.

->>> tokenizer.pre_tokenizer = Whitespace()
->>> files = [...]
->>> tokenizer.train(files, trainer)
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
+tokenizer("We are very happy to show you the 🤗 Transformers library", return_tensors="pt")
+{'input_ids': tensor([[     2,   1734,    708,   1508,   4915,    577,   1500,    692,    573,
+         156808, 128149,   9581, 235265]]), 
+ 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
+}
 ```

-We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to
-a JSON file for future re-use.
+Whichever tokenizer you use, make sure the tokenizer vocabulary is the same as the pretrained models tokenizer vocabulary. This is especially important if you're using a custom tokenizer with a different vocabulary from the pretrained models tokenizer.

-## Loading directly from the tokenizer object
+This guide provides a brief overview of the tokenizer classes and how to preprocess text with it.

-Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
-[`PreTrainedTokenizerFast`] class allows for easy instantiation, by accepting the instantiated
-*tokenizer* object as an argument:
+## Tokenizer classes

-```python
->>> from transformers import PreTrainedTokenizerFast
+All tokenizers inherit from a [`PreTrainedTokenizerBase`] class that provides common methods for all tokenizers like [`~PreTrainedTokenizerBase.from_pretrained`] and [`~PreTrainedTokenizerBase.batch_decode`]. There are two main tokenizer classes that build on top of the base class.

->>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
+- [`PreTrainedTokenizer`] is a Python implementation, for example [`LlamaTokenizer`].
+- [`PreTrainedTokenizerFast`] is a fast Rust-based implementation from the [Tokenizers](https://hf.co/docs/tokenizers/index) library, for example [`LlamaTokenizerFast`].
+
+There are two ways you can load a tokenizer, with [`AutoTokenizer`] or a model-specific tokenizer.
+
+<hfoptions id="tokenizer-classes">
+<hfoption id="AutoTokenizer">
+
+The [AutoClass](./model_doc/auto) API is a fast and easy way to load a tokenizer without needing to know whether a Python or Rust-based implementation is available. By default, [`AutoTokenizer`] tries to load a fast tokenizer if it's available, otherwise, it loads the Python implementation.
+
+Use [`~PreTrainedTokenizer.from_pretrained`] to load a tokenizer.
+
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
+tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")
+{'input_ids': tensor([[     2,   1734,    708,   1508,   4915,    577,   1500,    692,    573,
+         156808, 128149,   9581, 235265]]), 
+ 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
+}
 ```

-This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer
-page](main_classes/tokenizer) for more information.
+Load your own tokenizer by passing its vocabulary file to [`~AutoTokenizer.from_pretrained`].

-## Loading from a JSON file
+```py
+from transformers import AutoTokenizer

-In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:
-
-```python
->>> tokenizer.save("tokenizer.json")
+tokenizer = AutoTokenizer.from_pretrained("./model_directory/my_vocab_file.txt")
 ```

-The path to which we saved this file can be passed to the [`PreTrainedTokenizerFast`] initialization
-method using the `tokenizer_file` parameter:
+</hfoption>
+<hfoption id="model-specific tokenizer">

-```python
->>> from transformers import PreTrainedTokenizerFast
+Each pretrained model is associated with a tokenizer and the specific vocabulary it was trained on. A tokenizer can be loaded directly from the model-specific class.

->>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
+> [!TIP]
+> Refer to a models API documentation to check whether a fast tokenizer is supported.
+
+```py
+from transformers import GemmaTokenizer
+
+tokenizer = GemmaTokenizer.from_pretrained("google/gemma-2-2b")
+tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")
 ```

-This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer
-page](main_classes/tokenizer) for more information.
+To load a fast tokenizer, use the fast implementation class.
+
+```py
+from transformers import GemmaTokenizerFast
+
+tokenizer = GemmaTokenizerFast.from_pretrained("google/gemma-2-2b")
+tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")
+```
+
+Load your own tokenizer by passing its vocabulary file to the `vocab_file` parameter.
+
+```py
+from transformers import GemmaTokenizerFast
+
+tokenizer = GemmaTokenizerFast(vocab_file="my_vocab_file.txt")
+```
+
+</hfoption>
+</hfoptions>
+
+## Multimodal tokenizers
+
+In addition to text tokens, multimodal tokenizers also holds tokens from other modalities as a part of its attributes for easy access. 
+
+To add these special tokens to a tokenizer, pass them as a dictionary to the `extra_special_tokens` parameter in [`~AutoTokenizer.from_pretrained`]. The example below adds the `image_token` to a vision-language model.
+
+Save the tokenizer so you can reuse it with direct access to the `image_token`, `boi_token`, and `eoi_token`.
+
+```py
+vision_tokenizer = AutoTokenizer.from_pretrained(
+    "llava-hf/llava-1.5-7b-hf",
+    extra_special_tokens={"image_token": "<image>", "boi_token": "<image_start>", "eoi_token": "<image_end>"}
+)
+print(vision_tokenizer.image_token, vision_tokenizer.image_token_id)
+("<image>", 32000)
+
+vision_tokenizer.save_pretrained("./path/to/tokenizer")
+```
+
+## Fast tokenizers
+
+<Youtube id="3umI3tm27Vw"/>
+
+[`PreTrainedTokenizerFast`] or *fast tokenizers* are Rust-based tokenizers from the [Tokenizers](https://hf.co/docs/tokenizers) library. It is significantly faster at batched tokenization and provides additional alignment methods compared to the Python-based tokenizers.
+
+[`AutoTokenizer`] automatically loads a fast tokenizer if it's supported. Otherwise, you need to explicitly load the fast tokenizer.
+
+This section will show you how to train a fast tokenizer and reuse it in Transformers.
+
+To train a Byte-Pair Encoding (BPE) tokenizer, create a [`~tokenizers.Tokenizer`] and [`~tokenizers.trainers.BpeTrainer`] class and define the unknown token and special tokens.
+
+```py
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+
+tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+```
+
+Split the tokens on [`~tokenizers.pre_tokenizers.Whitespace`] to create tokens that don't overlap with each other.
+
+```py
+from tokenizers.pre_tokenizers import Whitespace
+
+tokenizer.pre_tokenizer = Whitespace()
+```
+
+Call [`~tokenizers.Tokenizer.train`] on the text files and trainer to start training.
+
+```py
+files = [...]
+tokenizer.train(files, trainer)
+```
+
+Use [`~tokenizers.Tokenizer.save`] to save the tokenizers configuration and vocabulary to a JSON file.
+
+```py
+tokenizer.save("tokenizer.json")
+```
+
+Now you can load and reuse the tokenizer object in Transformers by passing it to the `tokenizer_object` parameter in [`PreTrainedTokenizerFast`].
+
+```py
+from transformers import PreTrainedTokenizerFast
+
+fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
+```
+
+To load a saved tokenizer from its JSON file, pass the file path to the `tokenizer_file` parameter in [`PreTrainedTokenizerFast`].
+
+```py
+from transformers import PreTrainedTokenizerFast
+
+fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
+```
+
+## tiktoken
+
+[tiktoken](https://github.com/openai/tiktoken) is a [byte-pair encoding (BPE)](./tokenizer_summary#byte-pair-encoding-bpe) tokenizer by OpenAI. It includes several tokenization schemes or encodings for how text should be tokenized.
+
+There are currently two models trained and released with tiktoken, GPT2 and Llama3. Transformers supports models with a [tokenizer.model](https://hf.co/meta-llama/Meta-Llama-3-8B/blob/main/original/tokenizer.model) tiktoken file. The tiktoken file is automatically converted into Transformers Rust-based [`PreTrainedTokenizerFast`].
+
+Add the `subfolder` parameter to [`~PreTrainedModel.from_pretrained`] to specify where the `tokenizer.model` tiktoken file is located.
+
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", subfolder="original") 
+```
+
+### Create a tiktoken tokenizer
+
+The tiktoken `tokenizer.model` file contains no information about additional tokens or pattern strings. If these are important, convert the tokenizer to `tokenizer.json` (the appropriate format for [`PreTrainedTokenizerFast`]).
+
+Generate the tiktoken `tokenizer.model` file with the [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) function, and convert it to `tokenizer.json` with [convert_tiktoken_to_fast](https://github.com/huggingface/transformers/blob/99e0ab6ed888136ea4877c6d8ab03690a1478363/src/transformers/integrations/tiktoken.py#L8).
+
+```py
+from transformers.integrations.tiktoken import convert_tiktoken_to_fast
+from tiktoken import get_encoding
+
+# Load your custom encoding or the one provided by OpenAI
+encoding = get_encoding("gpt2")
+convert_tiktoken_to_fast(encoding, "config/save/dir")
+```
+
+The resulting `tokenizer.json` file is saved to the specified directory and loaded with [`~PreTrainedTokenizerFast.from_pretrained`].
+
+```py
+tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir")
+```
+
+## Preprocess
+
+<Youtube id="Yffk5aydLzg"/>
+
+A Transformers model expects the input to be a PyTorch, TensorFlow, or NumPy tensor. A tokenizers job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter.
+
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
+tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")
+{'input_ids': tensor([[     2,   1734,    708,   1508,   4915,    577,   1500,    692,    573,
+         156808, 128149,   9581, 235265]]), 
+ 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
+}
+```
+
+The tokenization process of converting text into input ids is completed in two steps.
+
+<hfoptions id="steps">
+<hfoption id="1. tokenize">
+
+In the first step, a string of text is split into tokens by the [`~PreTrainedTokenizer.tokenize`] function. How the text is split depends on the tokenization algorithm.
+
+```py
+tokens = tokenizer.tokenize("We are very happy to show you the 🤗 Transformers library")
+print(tokens)
+['We', '▁are', '▁very', '▁happy', '▁to', '▁show', '▁you', '▁the', '▁🤗', '▁Transformers', '▁library']
+```
+
+Gemma uses a [SentencePiece](./tokenizer_summary#sentencepiece) tokenizer which replaces spaces with an underscore `_`.
+
+</hfoption>
+<hfoption id="2. convert tokens to ids">
+
+In the second step, the tokens are converted into ids with [`~PreTrainedTokenizer.convert_tokens_to_ids`].
+
+```py
+ids = tokenizer.convert_tokens_to_ids(tokens)
+print(ids)
+[1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581]
+```
+
+</hfoption>
+<hfoption id="3. decode ids to text">
+
+Lastly, the model prediction typically generates numerical outputs which are converted back to text with [`~PreTrainedTokenizer.decode`].
+
+```py
+decoded_string = tokenizer.decode(ids)
+print(decoded_string)
+'We are very happy to show you the 🤗 Transformers library'
+```
+
+</hfoption>
+</hfoptions>
+
+> [!TIP]
+> Visualize how different tokenizers work in the [Tokenizer Playground](https://xenova-the-tokenizer-playground.static.hf.space).
+
+### Special tokens
+
+Special tokens provide the model with some additional information about the text.
+
+For example, if you compare the tokens obtained from passing text directly to the tokenizer and from [`~PreTrainedTokenizer.convert_tokens_to_ids`], you'll notice some additional tokens are added.
+
+```py
+model_inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
+[2, 1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581]
+tokenizer.convert_tokens_to_ids(tokens)
+[1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581]
+```
+
+When you [`~PreTrainedTokenizer.decode`] the ids, you'll see `<bos>` at the beginning of the string. This is used to indicate the beginning of a sentence to the model.
+
+```py
+print(tokenizer.decode(model_inputs["input_ids"]))
+print(tokenizer.decode(ids))
+'<bos>We are very happy to show you the 🤗 Transformers library.'
+'We are very happy to show you the 🤗 Transformers library'
+```
+
+Not all models need special tokens, but if they do, a tokenizer automatically adds them.
+
+### Batch tokenization
+
+It is faster and more efficient to preprocess *batches* of text instead of a single sentence at a time. Fast tokenizers are especially good at parallelizing tokenization.
+
+Pass a list of string text to the tokenizer.
+
+```py
+batch_sentences = [
+    "But what about second breakfast?",
+    "Don't think he knows about second breakfast, Pip.",
+    "What about elevensies?",
+]
+encoded_inputs = tokenizer(batch_sentences, return_tensors="pt")
+print(encoded_inputs)
+{
+ 'input_ids': 
+    [[2, 1860, 1212, 1105, 2257, 14457, 235336], 
+     [2, 4454, 235303, 235251, 1742, 693, 9242, 1105, 2257, 14457, 235269, 48782, 235265], 
+     [2, 1841, 1105, 29754, 37453, 235336]], 
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], 
+                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 
+                    [1, 1, 1, 1, 1, 1]]
+}
+```
+
+### Padding
+
+> [!TIP]
+> Learn about additional padding strategies in the [Padding and truncation](./pad_truncation) guide.
+
+In the output above, the `input_ids` have different lengths. This is an issue because Transformers expects them to have the same lengths so it can pack them into a batch. Sequences with uneven lengths can't be batched.
+
+Padding adds a special *padding token* to ensure all sequences have the same length. Set `padding=True` to pad the sequences to the longest sequence length in the batch.
+
+```py
+encoded_inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt")
+print(encoded_inputs)
+```
+
+The tokenizer added the special padding token `0` to the left side (*left padding*) because Gemma and LLMs in general are not trained to continue generation from a padding token.
+
+### Truncation
+
+> [!TIP]
+> Learn about additional truncation strategies in the [Padding and truncation](./pad_truncation) guide.
+
+Models are only able to process sequences up to a certain length. If you try to process a sequence longer than a model can handle, it crashes.
+
+Truncation removes tokens from a sequence to ensure it doesn't exceed the maximum length. Set `truncation=True` to truncate a sequence to the maximum length accepted by the model. You can also set the maximum length yourself with the `max_length` parameter.
+
+```py
+encoded_inputs = tokenizer(batch_sentences, max_length=8, truncation=True, return_tensors="pt")
+print(encoded_inputs)
+```
--- a/docs/source/en/feature_extractors.md
+++ b/docs/source/en/feature_extractors.md
@ -0,0 +1,199 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Feature extractors
+
+Feature extractors preprocess audio data into the correct format for a given model. It takes the raw audio signal and converts it into a tensor that can be fed to a model. The tensor shape depends on the model, but the feature extractor will correctly preprocess the audio data for you given the model you're using. Feature extractors also include methods for padding, truncation, and resampling.
+
+Call [`~AutoFeatureExtractor.from_pretrained`] to load a feature extractor and its preprocessor configuration from the Hugging Face [Hub](https://hf.co/models) or local directory. The feature extractor and preprocessor configuration is saved in a [preprocessor_config.json](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json) file.
+
+Pass the audio signal, typically stored in `array`, to the feature extractor and set the `sampling_rate` parameter to the pretrained audio models sampling rate. It is important the sampling rate of the audio data matches the sampling rate of the data a pretrained audio model was trained on.
+
+```py
+from transformers import AutoFeatureExtractor
+
+feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
+processed_sample
+{'input_values': [array([ 9.4472744e-05,  3.0777880e-03, -2.8888427e-03, ...,
+       -2.8888427e-03,  9.4472744e-05,  9.4472744e-05], dtype=float32)]}
+```
+
+The feature extractor returns an input, `input_values`, that is ready for the model to consume.
+
+This guide walks you through the feature extractor classes and how to preprocess audio data.
+
+## Feature extractor classes
+
+Transformers feature extractors inherit from the base [`SequenceFeatureExtractor`] class which subclasses [`FeatureExtractionMixin`].
+
+- [`SequenceFeatureExtractor`] provides a method to [`~SequenceFeatureExtractor.pad`] sequences to a certain length to avoid uneven sequence lengths.
+- [`FeatureExtractionMixin`] provides [`~FeatureExtractionMixin.from_pretrained`] and [`~FeatureExtractionMixin.save_pretrained`] to load and save a feature extractor.
+
+There are two ways you can load a feature extractor, [`AutoFeatureExtractor`] and a model-specific feature extractor class.
+
+<hfoptions id="feature-extractor-classes">
+<hfoption id="AutoFeatureExtractor">
+
+The [AutoClass](./model_doc/auto) API automatically loads the correct feature extractor for a given model.
+
+Use [`~AutoFeatureExtractor.from_pretrained`] to load a feature extractor.
+
+```py
+from transformers import AutoFeatureExtractor
+
+feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")
+```
+
+</hfoption>
+<hfoption id="model-specific feature extractor">
+
+Every pretrained audio model has a specific associated feature extractor for correctly processing audio data. When you load a feature extractor, it retrieves the feature extractors configuration (feature size, chunk length, etc.) from [preprocessor_config.json](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json).
+
+A feature extractor can be loaded directly from its model-specific class.
+
+```py
+from transformers import WhisperFeatureExtractor
+
+feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
+```
+
+</hfoption>
+</hfoptions>
+
+## Preprocess
+
+A feature extractor expects the input as a PyTorch tensor of a certain shape. The exact input shape can vary depending on the specific audio model you're using.
+
+For example, [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper) expects `input_features` to be a tensor of shape `(batch_size, feature_size, sequence_length)` but [Wav2Vec2](https://hf.co/docs/transformers/model_doc/wav2vec2) expects `input_values` to be a tensor of shape `(batch_size, sequence_length)`.
+
+The feature extractor generates the correct input shape for whichever audio model you're using.
+
+A feature extractor also sets the sampling rate (the number of audio signal values taken per second) of the audio files. The sampling rate of your audio data must match the sampling rate of the dataset a pretrained model was trained on. This value is typically given in the model card.
+
+Load a dataset and feature extractor with [`~FeatureExtractionMixin.from_pretrained`].
+
+```py
+from datasets import load_dataset, Audio
+from transformers import AutoFeatureExtractor
+
+dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
+feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+```
+
+Check out the first example from the dataset and access the `audio` column which contains `array`, the raw audio signal.
+
+```py
+dataset[0]["audio"]["array"]
+array([ 0.        ,  0.00024414, -0.00024414, ..., -0.00024414,
+        0.        ,  0.        ])
+```
+
+The feature extractor preprocesses `array` into the expected input format for a given audio model. Use the `sampling_rate` parameter to set the appropriate sampling rate.
+
+```py
+processed_dataset = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
+processed_dataset
+{'input_values': [array([ 9.4472744e-05,  3.0777880e-03, -2.8888427e-03, ...,
+       -2.8888427e-03,  9.4472744e-05,  9.4472744e-05], dtype=float32)]}
+```
+
+### Padding
+
+Audio sequence lengths that are different is an issue because Transformers expects all sequences to have the same lengths so they can be batched. Uneven sequence lengths can't be batched.
+
+```py
+dataset[0]["audio"]["array"].shape
+(86699,)
+
+dataset[1]["audio"]["array"].shape
+(53248,)
+```
+
+Padding adds a special *padding token* to ensure all sequences have the same length. The feature extractor adds a `0` - interpreted as silence - to `array` to pad it. Set `padding=True` to pad sequences to the longest sequence length in the batch.
+
+```py
+def preprocess_function(examples):
+    audio_arrays = [x["array"] for x in examples["audio"]]
+    inputs = feature_extractor(
+        audio_arrays,
+        sampling_rate=16000,
+        padding=True,
+    )
+    return inputs
+
+processed_dataset = preprocess_function(dataset[:5])
+processed_dataset["input_values"][0].shape
+(86699,)
+
+processed_dataset["input_values"][1].shape
+(86699,)
+```
+
+### Truncation
+
+Models can only process sequences up to a certain length before crashing.
+
+Truncation is a strategy for removing excess tokens from a sequence to ensure it doesn't exceed the maximum length. Set `truncation=True` to truncate a sequence to the length in the `max_length` parameter.
+
+```py
+def preprocess_function(examples):
+    audio_arrays = [x["array"] for x in examples["audio"]]
+    inputs = feature_extractor(
+        audio_arrays,
+        sampling_rate=16000,
+        max_length=50000,
+        truncation=True,
+    )
+    return inputs
+
+processed_dataset = preprocess_function(dataset[:5])
+processed_dataset["input_values"][0].shape
+(50000,)
+
+processed_dataset["input_values"][1].shape
+(50000,)
+```
+
+### Resampling
+
+The [Datasets](https://hf.co/docs/datasets/index) library can also resample audio data to match an audio models expected sampling rate. This method resamples the audio data on the fly when they're loaded which can be faster than resampling the entire dataset in-place.
+
+The audio dataset you've been working on has a sampling rate of 8kHz and the pretrained model expects 16kHz.
+
+```py
+dataset[0]["audio"]
+{'path': '/root/.cache/huggingface/datasets/downloads/extracted/f507fdca7f475d961f5bb7093bcc9d544f16f8cab8608e772a2ed4fbeb4d6f50/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
+ 'array': array([ 0.        ,  0.00024414, -0.00024414, ..., -0.00024414,
+         0.        ,  0.        ]),
+ 'sampling_rate': 8000}
+```
+
+Call [`~datasets.Dataset.cast_column`] on the `audio` column to upsample the sampling rate to 16kHz.
+
+```py
+dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+```
+
+When you load the dataset sample, it is now resampled to 16kHz.
+
+```py
+dataset[0]["audio"]
+{'path': '/root/.cache/huggingface/datasets/downloads/extracted/f507fdca7f475d961f5bb7093bcc9d544f16f8cab8608e772a2ed4fbeb4d6f50/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
+ 'array': array([ 1.70562416e-05,  2.18727451e-04,  2.28099874e-04, ...,
+         3.43842403e-05, -5.96364771e-06, -1.76846661e-05]),
+ 'sampling_rate': 16000}
+```
--- a/docs/source/en/fsdp.md
+++ b/docs/source/en/fsdp.md
@ -1,4 +1,4 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -14,81 +14,86 @@ rendered properly in your Markdown viewer.

 -->

-# Fully Sharded Data Parallel
+# FullyShardedDataParallel

-[Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) is a data parallel method that shards a model's parameters, gradients and optimizer states across the number of available GPUs (also called workers or *rank*). Unlike [DistributedDataParallel (DDP)](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html), FSDP reduces memory-usage because a model is replicated on each GPU. This improves GPU memory-efficiency and allows you to train much larger models on fewer GPUs. FSDP is integrated with the Accelerate, a library for easily managing training in distributed environments, which means it is available for use from the [`Trainer`] class.
+[Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) is a [parallelism](./perf_train_gpu_many) method that combines the advantages of data and model parallelism for distributed training.

-Before you start, make sure Accelerate is installed and at least PyTorch 2.1.0 or newer.
+Unlike [DistributedDataParallel (DDP)](./perf_train_gpu_many#distributeddataparallel), FSDP saves more memory because it doesn't replicate a model on each GPU. It shards the models parameters, gradients and optimizer states across GPUs. Each model shard processes a portion of the data and the results are synchronized to speed up training.
+
+This guide covers how to set up training a model with FSDP and [Accelerate](https://hf.co/docs/accelerate/index), a library for managing distributed training.

 ```bash
 pip install accelerate
 ```

-## FSDP configuration
+## Configuration options

-To start, run the [`accelerate config`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-config) command to create a configuration file for your training environment. Accelerate uses this configuration file to automatically setup the correct training environment based on your selected training options in `accelerate config`.
+Always start by running the [accelerate config](https://hf.co/docs/accelerate/package_reference/cli#accelerate-config) command to help Accelerate set up the correct distributed training environment.

 ```bash
 accelerate config
 ```

-When you run `accelerate config`, you'll be prompted with a series of options to configure your training environment. This section covers some of the most important FSDP options. To learn more about the other available FSDP options, take a look at the [fsdp_config](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.fsdp_config) parameters.
+The section below discusses some of the more important FSDP configuration options. Learn more about other available options in the [fsdp_config](https://hf.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.fsdp_config) parameter.

 ### Sharding strategy

-FSDP offers a number of sharding strategies to select from:
+FSDP offers several sharding strategies to distribute a model. Refer to the table below to help you choose the best strategy for your setup. Specify a strategy with the `fsdp_sharding_strategy` parameter in the configuration file.

-* `FULL_SHARD` - shards model parameters, gradients and optimizer states across workers; select `1` for this option
-* `SHARD_GRAD_OP`- shard gradients and optimizer states across workers; select `2` for this option
-* `NO_SHARD` - don't shard anything (this is equivalent to DDP); select `3` for this option
-* `HYBRID_SHARD` - shard model parameters, gradients and optimizer states within each worker where each worker also has a full copy; select `4` for this option
-* `HYBRID_SHARD_ZERO2` - shard gradients and optimizer states within each worker where each worker also has a full copy; select `5` for this option
-
-This is enabled by the `fsdp_sharding_strategy` flag.
+| sharding strategy | description | parameter value |
+|---|---|---|
+| `FULL_SHARD` | shards model parameters, gradients, and optimizer states | `1` |
+| `SHARD_GRAD_OP` | shards gradients and optimizer states | `2` |
+| `NO_SHARD` | don't shard the model | `3` |
+| `HYBRID_SHARD` | shards model parameters, gradients, and optimizer states within each GPU | `4` |
+| `HYBRID_SHARD_ZERO2` | shards gradients and optimizer states within each GPU | `5` |

 ### CPU offload

-You could also offload parameters and gradients when they are not in use to the CPU to save even more GPU memory and help you fit large models where even FSDP may not be sufficient. This is enabled by setting `fsdp_offload_params: true` when running `accelerate config`.
+Offload model parameters and gradients when they aren't being used to the CPU to save additional GPU memory. This is useful for scenarios where a model is too large even with FSDP.
+
+Specify `fsdp_offload_params: true` in the configuration file to enable offloading.

 ### Wrapping policy

-FSDP is applied by wrapping each layer in the network. The wrapping is usually applied in a nested way where the full weights are discarded after each forward pass to save memory for use in the next layer. The *auto wrapping* policy is the simplest way to implement this and you don't need to change any code. You should select `fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP` to wrap a Transformer layer and `fsdp_transformer_layer_cls_to_wrap` to specify which layer to wrap (for example `BertLayer`).
+FSDP is applied by wrapping each layer in the network. The wrapping is usually applied in a nested way where the full weights are discarded after each forward pass to save memory for the next layer.

-Otherwise, you can choose a size-based wrapping policy where FSDP is applied to a layer if it exceeds a certain number of parameters. This is enabled by setting `fsdp_wrap_policy: SIZE_BASED_WRAP` and `min_num_param` to the desired size threshold.
+There are several wrapping policies available, but the *auto wrapping* policy is the simplest and doesn't require any changes to your code. Specify `fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP` to wrap a Transformer layer and `fsdp_transformer_layer_cls_to_wrap` to determine which layer to wrap (for example, `BertLayer`).

-### Checkpointing
+Size-based wrapping is also available. If a layer exceeds a certain number of parameters, it is wrapped. Specify `fsdp_wrap_policy: SIZED_BASED_WRAP` and `min_num_param` to set the minimum number of parameters for a layer to be wrapped.

-Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`] method.
+### Checkpoints
+
+Intermediate checkpoints should be saved as a sharded state dict because saving the full state dict - even with CPU offloading - is time consuming and can cause `NCCL Timeout` errors due to indefinite hanging during broadcasting.
+
+Specify `fsdp_state_dict_type: SHARDED_STATE_DICT` in the configuration file to save the sharded state dict. Now you can resume training from the sharded state dict with [`~accelerate.Accelerator.load_state`].

 ```py
-# directory containing checkpoints
-accelerator.load_state("ckpt")
+accelerator.load_state("directory/containing/checkpoints")
 ```

-However, when training ends, you want to save the full state dict because sharded state dict is only compatible with FSDP.
+Once training is complete though, you should save the full state dict because the sharded state dict is only compatible with FSDP.

 ```py
 if trainer.is_fsdp_enabled:
-    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
+  trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")

 trainer.save_model(script_args.output_dir)
 ```

 ### TPU

-[PyTorch XLA](https://pytorch.org/xla/release/2.1/index.html) supports FSDP training for TPUs and it can be enabled by modifying the FSDP configuration file generated by `accelerate config`. In addition to the sharding strategies and wrapping options specified above, you can add the parameters shown below to the file.
+[PyTorch XLA](https://pytorch.org/xla/release/2.1/index.html), a package for running PyTorch on XLA devices, enables FSDP on TPUs. Modify the configuration file to include the parameters below. Refer to the [xla_fsdp_settings](https://github.com/pytorch/xla/blob/2e6e183e0724818f137c8135b34ef273dea33318/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py#L128) parameter for additional XLA-specific parameters you can configure for FSDP.

 ```yaml
 xla: True # must be set to True to enable PyTorch/XLA
-xla_fsdp_settings: # XLA-specific FSDP parameters
-xla_fsdp_grad_ckpt: True # use gradient checkpointing
+xla_fsdp_settings: # XLA specific FSDP parameters
+xla_fsdp_grad_ckpt: True # enable gradient checkpointing
 ```

-The [`xla_fsdp_settings`](https://github.com/pytorch/xla/blob/2e6e183e0724818f137c8135b34ef273dea33318/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py#L128) allow you to configure additional XLA-specific parameters for FSDP.
+## Training

-## Launch training
-
-An example FSDP configuration file may look like:
+After running [accelerate config](https://hf.co/docs/accelerate/package_reference/cli#accelerate-config), your configuration file should be ready. An example configuration file is shown below that fully shards the parameter, gradient and optimizer states on two GPUs. Your file may look different depending on how you set up your configuration.

 ```yaml
 compute_environment: LOCAL_MACHINE
@ -119,20 +124,22 @@ tpu_use_sudo: false
 use_cpu: false
 ```

-To launch training, run the [`accelerate launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) command and it'll automatically use the configuration file you previously created with `accelerate config`.
+Run the [accelerate launch](https://hf.co/docs/accelerate/package_reference/cli#accelerate-launch) command to launch a training script with the FSDP configurations you chose in the configuration file.

 ```bash
-accelerate launch my-trainer-script.py
+accelerate launch my-training-script.py
 ```

+It is also possible to directly specify some of the FSDP arguments in the command line.
+
 ```bash
-accelerate launch --fsdp="full shard" --fsdp_config="path/to/fsdp_config/ my-trainer-script.py
+accelerate launch --fsdp="full shard" --fsdp_config="path/to/fsdp_config/" my-training-script.py
 ```

-## Next steps
+## Resources

-FSDP can be a powerful tool for training really large models and you have access to more than one GPU or TPU. By sharding the model parameters, optimizer and gradient states, and even offloading them to the CPU when they're inactive, FSDP can reduce the high cost of large-scale training. If you're interested in learning more, the following may be helpful:
+FSDP is a powerful tool for training large models with fewer GPUs compared to other parallelism strategies. Refer to the following resources below to learn even more about FSDP.

-* Follow along with the more in-depth Accelerate guide for [FSDP](https://huggingface.co/docs/accelerate/usage_guides/fsdp).
-* Read the [Introducing PyTorch Fully Sharded Data Parallel (FSDP) API](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) blog post.
-* Read the [Scaling PyTorch models on Cloud TPUs with FSDP](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/) blog post.
+- Follow along with the more in-depth Accelerate guide for [FSDP](https://hf.co/docs/accelerate/usage_guides/fsdp).
+- Read the [Introducing PyTorch Fully Sharded Data Parallel (FSDP) API](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) blog post.
+- Read the [Scaling PyTorch models on Cloud TPUs with FSDP](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/) blog post.
--- a/Show More
+++ b/Show More