Release: v4.41.2

Do not trigger autoconversion if local_files_only (#31004 )
Release: v4.41.1
2025-10-21 09:44:02 +08:00 · 2024-05-24 05:03:17 -04:00 · 2024-05-24 05:02:39 -04:00 · 2024-05-22 13:40:40 -04:00 · 2024-05-22 13:39:52 -04:00 · 2024-05-22 12:39:27 -04:00
1189 changed files with 29909 additions and 20007 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -12,7 +12,7 @@ jobs:
    # Ensure running with CircleCI/huggingface
    check_circleci_user:
        docker:
-            - image: cimg/python:3.8.12
+            - image: python:3.10-slim
        parallelism: 1
        steps:
            - run: echo $CIRCLE_PROJECT_USERNAME
@ -26,13 +26,11 @@ jobs:
    fetch_tests:
        working_directory: ~/transformers
        docker:
-            - image: cimg/python:3.8.12
+            - image: huggingface/transformers-quality
        parallelism: 1
        steps:
            - checkout
-            - run: pip install --upgrade --upgrade-strategy eager pip
-            - run: pip install -U --upgrade-strategy eager GitPython
-            - run: pip install -U --upgrade-strategy eager .
+            - run: uv pip install -U -e .
            - run: mkdir -p test_preparation
            - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt
            - store_artifacts:
@ -88,25 +86,22 @@ jobs:
                      echo "No tests to run, exiting early!"
                      circleci-agent step halt
                  fi
-            - run: cp test_preparation/generated_config.yml test_preparation/generated_config.txt
            - store_artifacts:
-                  path: test_preparation/generated_config.txt
+                path: test_preparation/generated_config.yml
            - store_artifacts:
-                  path: test_preparation/filtered_test_list_cross_tests.txt
+                path: test_preparation/filtered_test_list_cross_tests.txt
            - continuation/continue:
-                  configuration_path: test_preparation/generated_config.yml
+                configuration_path: test_preparation/generated_config.yml

    # To run all tests for the nightly build
    fetch_all_tests:
        working_directory: ~/transformers
        docker:
-            - image: cimg/python:3.8.12
+            - image: huggingface/transformers-consistency
        parallelism: 1
        steps:
            - checkout
-            - run: pip install --upgrade --upgrade-strategy eager pip
-            - run: pip install -U --upgrade-strategy eager GitPython
-            - run: pip install -U --upgrade-strategy eager .
+            - run: uv pip install -e .
            - run: |
                  mkdir test_preparation
                  echo -n "tests" > test_preparation/test_list.txt
@ -126,7 +121,7 @@ jobs:
    check_code_quality:
        working_directory: ~/transformers
        docker:
-            - image: cimg/python:3.8.12
+            - image: huggingface/transformers-quality
        resource_class: large
        environment:
            TRANSFORMERS_IS_CI: yes
@ -134,24 +129,7 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - restore_cache:
-                  keys:
-                      - v0.7-code_quality-pip-{{ checksum "setup.py" }}
-                      - v0.7-code-quality-pip
-            - restore_cache:
-                  keys:
-                      - v0.7-code_quality-site-packages-{{ checksum "setup.py" }}
-                      - v0.7-code-quality-site-packages
-            - run: pip install --upgrade --upgrade-strategy eager pip
-            - run: pip install -U --upgrade-strategy eager .[all,quality]
-            - save_cache:
-                  key: v0.7-code_quality-pip-{{ checksum "setup.py" }}
-                  paths:
-                      - '~/.cache/pip'
-            - save_cache:
-                  key: v0.7-code_quality-site-packages-{{ checksum "setup.py" }}
-                  paths:
-                      - '~/.pyenv/versions/'
+            - run: uv pip install -e .
            - run:
                name: Show installed libraries and their versions
                command: pip freeze | tee installed.txt
@ -167,7 +145,7 @@ jobs:
    check_repository_consistency:
        working_directory: ~/transformers
        docker:
-            - image: cimg/python:3.8.12
+            - image: huggingface/transformers-consistency
        resource_class: large
        environment:
            TRANSFORMERS_IS_CI: yes
@ -175,24 +153,7 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - restore_cache:
-                  keys:
-                      - v0.7-repository_consistency-pip-{{ checksum "setup.py" }}
-                      - v0.7-repository_consistency-pip
-            - restore_cache:
-                  keys:
-                      - v0.7-repository_consistency-site-packages-{{ checksum "setup.py" }}
-                      - v0.7-repository_consistency-site-packages
-            - run: pip install --upgrade --upgrade-strategy eager pip
-            - run: pip install -U --upgrade-strategy eager .[all,quality]
-            - save_cache:
-                  key: v0.7-repository_consistency-pip-{{ checksum "setup.py" }}
-                  paths:
-                      - '~/.cache/pip'
-            - save_cache:
-                  key: v0.7-repository_consistency-site-packages-{{ checksum "setup.py" }}
-                  paths:
-                      - '~/.pyenv/versions/'
+            - run: uv pip install -e .
            - run:
                name: Show installed libraries and their versions
                command: pip freeze | tee installed.txt
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -19,7 +19,7 @@ import os
 import random
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
-
+import glob
 import yaml


@ -41,7 +41,6 @@ class EmptyJob:

    def to_dict(self):
        return {
-            "working_directory": "~/transformers",
            "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE),
            "steps":["checkout"],
        }
@ -61,7 +60,6 @@ class CircleCIJob:
    pytest_options: Dict[str, Any] = None
    resource_class: Optional[str] = "2xlarge"
    tests_to_run: Optional[List[str]] = None
-    working_directory: str = "~/transformers"
    # This should be only used for doctest job!
    command_timeout: Optional[int] = None

@ -92,7 +90,6 @@ class CircleCIJob:
            cache_branch_prefix = "pull"

        job = {
-            "working_directory": self.working_directory,
            "docker": self.docker_image,
            "environment": env,
        }
@ -102,34 +99,14 @@ class CircleCIJob:
            job["parallelism"] = self.parallelism
        steps = [
            "checkout",
-            {"attach_workspace": {"at": "~/transformers/test_preparation"}},
-            {
-                "restore_cache": {
-                    "keys": [
-                        # check the fully-matched cache first
-                        f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-" + '{{ checksum "setup.py" }}',
-                        # try the partially-matched cache from `main`
-                        f"v{self.cache_version}-{self.cache_name}-main-pip-",
-                        # try the general partially-matched cache
-                        f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-",
-                    ]
-                }
-            },
-            {
-                "restore_cache": {
-                    "keys": [
-                        f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-" + '{{ checksum "setup.py" }}',
-                        f"v{self.cache_version}-{self.cache_name}-main-site-packages-",
-                        f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-",
-                    ]
-                }
-            },
+            {"attach_workspace": {"at": "test_preparation"}},
        ]
        steps.extend([{"run": l} for l in self.install_steps])
-        steps.extend([{"run": 'pip install "fsspec>=2023.5.0,<2023.10.0"'}])
-        steps.extend([{"run": "pip install pytest-subtests"}])
-        steps.append({"run": {"name": "Show installed libraries and their versions", "command": "pip freeze | tee installed.txt"}})
-        steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}})
+        steps.append({"run": {"name": "Show installed libraries and their size", "command": """du -h -d 1 "$(pip -V | cut -d ' ' -f 4 | sed 's/pip//g')" | grep -vE "dist-info|_distutils_hack|__pycache__" | sort -h | tee installed.txt || true"""}})
+        steps.append({"run": {"name": "Show installed libraries and their versions", "command": """pip list --format=freeze | tee installed.txt || true"""}})
+
+        steps.append({"run":{"name":"Show biggest libraries","command":"""dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true"""}})
+        steps.append({"store_artifacts": {"path": "installed.txt"}})

        all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options}
        pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()]
@ -138,11 +115,11 @@ class CircleCIJob:
        )

        steps.append({"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}})
-
        test_command = ""
        if self.command_timeout:
            test_command = f"timeout {self.command_timeout} "
-        test_command += f"python -m pytest -rs --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags)
+        # junit familiy xunit1 is necessary to support splitting on test name or class name with circleci split
+        test_command += f"python3 -m pytest -rsfE -p no:warnings -o junit_family=xunit1 --tb=short --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags)

        if self.parallelism == 1:
            if self.tests_to_run is None:
@ -155,7 +132,7 @@ class CircleCIJob:
            if tests is None:
                folder = os.environ["test_preparation_dir"]
                test_file = os.path.join(folder, "filtered_test_list.txt")
-                if os.path.exists(test_file):
+                if os.path.exists(test_file): # We take this job's tests from the filtered test_list.txt
                    with open(test_file) as f:
                        tests = f.read().split(" ")

@ -167,17 +144,26 @@ class CircleCIJob:
                if test.endswith(".py"):
                    expanded_tests.append(test)
                elif test == "tests/models":
-                    expanded_tests.extend([os.path.join(test, x) for x in os.listdir(test)])
+                    if "tokenization" in self.name:
+                        expanded_tests.extend(glob.glob("tests/models/**/test_tokenization*.py", recursive=True))
+                    elif self.name in ["flax","torch","tf"]:
+                        name = self.name if self.name != "torch" else ""
+                        if self.name == "torch":
+                            all_tests = glob.glob(f"tests/models/**/test_modeling_{name}*.py", recursive=True) 
+                            filtered = [k for k in all_tests if ("_tf_") not in k and "_flax_" not in k]
+                            expanded_tests.extend(filtered)
+                        else:
+                            expanded_tests.extend(glob.glob(f"tests/models/**/test_modeling_{name}*.py", recursive=True))
+                    else:
+                        expanded_tests.extend(glob.glob("tests/models/**/test_modeling*.py", recursive=True))
                elif test == "tests/pipelines":
-                    expanded_tests.extend([os.path.join(test, x) for x in os.listdir(test)])
+                    expanded_tests.extend(glob.glob("tests/models/**/test_modeling*.py", recursive=True)) 
                else:
                    expanded_tests.append(test)
-            # Avoid long tests always being collected together
-            random.shuffle(expanded_tests)
            tests = " ".join(expanded_tests)

            # Each executor to run ~10 tests
-            n_executors = max(len(tests) // 10, 1)
+            n_executors = max(len(expanded_tests) // 10, 1)
            # Avoid empty test list on some executor(s) or launching too many executors
            if n_executors > self.parallelism:
                n_executors = self.parallelism
@ -190,13 +176,13 @@ class CircleCIJob:
            command = 'TESTS=$(circleci tests split tests.txt) && echo $TESTS > splitted_tests.txt'
            steps.append({"run": {"name": "Split tests", "command": command}})

-            steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}})
-            steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}})
+            steps.append({"store_artifacts": {"path": "tests.txt"}})
+            steps.append({"store_artifacts": {"path": "splitted_tests.txt"}})

            test_command = ""
-            if self.timeout:
-                test_command = f"timeout {self.timeout} "
-            test_command += f"python -m pytest -rs -n {self.pytest_num_workers} " + " ".join(pytest_flags)
+            if self.command_timeout:
+                test_command = f"timeout {self.command_timeout} "
+            test_command += f"python3 -m pytest -rsfE -p no:warnings --tb=short  -o junit_family=xunit1 --junitxml=test-results/junit.xml -n {self.pytest_num_workers} " + " ".join(pytest_flags)
            test_command += " $(cat splitted_tests.txt)"
        if self.marker is not None:
            test_command += f" -m {self.marker}"
@ -211,61 +197,17 @@ class CircleCIJob:
            # failure.
            test_command = f"({test_command}) || true"
        else:
-            test_command = f"({test_command} | tee tests_output.txt) || true"
+            test_command = f"({test_command} | tee tests_output.txt)"
        steps.append({"run": {"name": "Run tests", "command": test_command}})

-        # Deal with errors
-        check_test_command = f'if [ -s reports/{self.job_name}/errors.txt ]; '
-        check_test_command += 'then echo "Some tests errored out!"; echo ""; '
-        check_test_command += f'cat reports/{self.job_name}/errors.txt; '
-        check_test_command += 'echo ""; echo ""; '
-
-        py_command = f'import os; fp = open("reports/{self.job_name}/summary_short.txt"); failed = os.linesep.join([x for x in fp.read().split(os.linesep) if x.startswith("ERROR ")]); fp.close(); fp = open("summary_short.txt", "w"); fp.write(failed); fp.close()'
-        check_test_command += f"$(python3 -c '{py_command}'); "
-        check_test_command += 'cat summary_short.txt; echo ""; exit -1; '
-
-        # Deeal with failed tests
-        check_test_command += f'elif [ -s reports/{self.job_name}/failures_short.txt ]; '
-        check_test_command += 'then echo "Some tests failed!"; echo ""; '
-        check_test_command += f'cat reports/{self.job_name}/failures_short.txt; '
-        check_test_command += 'echo ""; echo ""; '
-
-        py_command = f'import os; fp = open("reports/{self.job_name}/summary_short.txt"); failed = os.linesep.join([x for x in fp.read().split(os.linesep) if x.startswith("FAILED ")]); fp.close(); fp = open("summary_short.txt", "w"); fp.write(failed); fp.close()'
-        check_test_command += f"$(python3 -c '{py_command}'); "
-        check_test_command += 'cat summary_short.txt; echo ""; exit -1; '
-
-        check_test_command += f'elif [ -s reports/{self.job_name}/stats.txt ]; then echo "All tests pass!"; '
-
-        # return code `124` means the previous (pytest run) step is timeout
-        if self.name == "pr_documentation_tests":
-            check_test_command += 'elif [ -f 124.txt ]; then echo "doctest timeout!"; '
-
-        check_test_command += 'else echo "other fatal error"; echo ""; exit -1; fi;'
-
-        steps.append({"run": {"name": "Check test results", "command": check_test_command}})
+        steps.append({"run": {"name": "Skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}})
+        steps.append({"run": {"name": "Failed tests",  "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}})
+        steps.append({"run": {"name": "Errors",        "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}})

        steps.append({"store_test_results": {"path": "test-results"}})
-
-        steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}})
-        steps.append({"store_artifacts": {"path": "~/transformers/reports"}})
-
-        # save cache at the end: so pytest step runs before cache saving and we can see results earlier
-        steps.append(
-            {
-                "save_cache": {
-                    "key": f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-pip-" + '{{ checksum "setup.py" }}',
-                    "paths": ["~/.cache/pip"],
-                }
-            }
-        )
-        steps.append(
-            {
-                "save_cache": {
-                    "key": f"v{self.cache_version}-{self.cache_name}-{cache_branch_prefix}-site-packages-" + '{{ checksum "setup.py" }}',
-                    "paths": ["~/.pyenv/versions/"],
-                }
-            }
-        )
+        steps.append({"store_artifacts": {"path": "tests_output.txt"}})
+        steps.append({"store_artifacts": {"path": "test-results/junit.xml"}})
+        steps.append({"store_artifacts": {"path": "reports"}})

        job["steps"] = steps
        return job
@ -278,18 +220,9 @@ class CircleCIJob:
 # JOBS
 torch_and_tf_job = CircleCIJob(
    "torch_and_tf",
+    docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
+    install_steps=["uv venv && uv pip install ."],
    additional_env={"RUN_PT_TF_CROSS_TESTS": True},
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs cmake",
-        "git lfs install",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]",
-        "pip install -U --upgrade-strategy eager tensorflow_probability",
-        # Without --no-deps we can't pin dependency versions in the future
-        "pip install -U --upgrade-strategy eager --no-deps -e git+https://github.com/huggingface/accelerate@main#egg=accelerate",
-        # TODO: remove this one after fixing the dependency issue(s) above
-        "pip install -U --upgrade-strategy eager torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu",
-    ],
    marker="is_pt_tf_cross_test",
    pytest_options={"rA": None, "durations": 0},
 )
@ -298,77 +231,61 @@ torch_and_tf_job = CircleCIJob(
 torch_and_flax_job = CircleCIJob(
    "torch_and_flax",
    additional_env={"RUN_PT_FLAX_CROSS_TESTS": True},
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
-        "pip install -U --upgrade-strategy eager --upgrade pip",
-        "pip install -U --upgrade-strategy eager .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]",
-        # Without --no-deps we can't pin dependency versions in the future
-        "pip install -U --upgrade-strategy eager --no-deps -e git+https://github.com/huggingface/accelerate@main#egg=accelerate",
-    ],
+    docker_image=[{"image":"huggingface/transformers-torch-jax-light"}],
+    install_steps=["uv venv && uv pip install ."],
    marker="is_pt_flax_cross_test",
    pytest_options={"rA": None, "durations": 0},
 )

-
 torch_job = CircleCIJob(
    "torch",
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]",
-        # Without --no-deps we can't pin dependency versions in the future
-        "pip install -U --upgrade-strategy eager --no-deps -e git+https://github.com/huggingface/accelerate@main#egg=accelerate",
-    ],
-    parallelism=1,
-    pytest_num_workers=12,
+    docker_image=[{"image": "huggingface/transformers-torch-light"}],
+    install_steps=["uv venv && uv pip install ."],
+    parallelism=6,
+    pytest_num_workers=16
+)
+
+tokenization_job = CircleCIJob(
+    "tokenization",
+    docker_image=[{"image": "huggingface/transformers-torch-light"}],
+    install_steps=["uv venv && uv pip install ."],
+    parallelism=6,
+    pytest_num_workers=16
 )


 tf_job = CircleCIJob(
    "tf",
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]",
-        "pip install -U --upgrade-strategy eager tensorflow_probability",
-    ],
-    parallelism=1,
+    docker_image=[{"image":"huggingface/transformers-tf-light"}],
+    install_steps=["uv venv", "uv pip install -e."],
+    parallelism=6,
+    pytest_num_workers=16,
 )


 flax_job = CircleCIJob(
    "flax",
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[flax,testing,sentencepiece,flax-speech,vision]",
-    ],
-    parallelism=1,
+    docker_image=[{"image":"huggingface/transformers-jax-light"}],
+    install_steps=["uv venv && uv pip install ."],
+    parallelism=6,
+    pytest_num_workers=16
 )


 pipelines_torch_job = CircleCIJob(
    "pipelines_torch",
    additional_env={"RUN_PIPELINE_TESTS": True},
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video]",
-    ],
+    docker_image=[{"image":"huggingface/transformers-torch-light"}],
+    install_steps=["uv venv && uv pip install ."],
    marker="is_pipeline_test",
-    pytest_num_workers=12,
 )


 pipelines_tf_job = CircleCIJob(
    "pipelines_tf",
    additional_env={"RUN_PIPELINE_TESTS": True},
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y cmake",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[sklearn,tf-cpu,testing,sentencepiece,vision]",
-        "pip install -U --upgrade-strategy eager tensorflow_probability",
-    ],
+    docker_image=[{"image":"huggingface/transformers-tf-light"}],
+    install_steps=["uv venv && uv pip install ."],
    marker="is_pipeline_test",
 )

@ -376,22 +293,8 @@ pipelines_tf_job = CircleCIJob(
 custom_tokenizers_job = CircleCIJob(
    "custom_tokenizers",
    additional_env={"RUN_CUSTOM_TOKENIZERS": True},
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y cmake",
-        {
-            "name": "install jumanpp",
-            "command":
-                "wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz\n"
-                "tar xvf jumanpp-2.0.0-rc3.tar.xz\n"
-                "mkdir jumanpp-2.0.0-rc3/bld\n"
-                "cd jumanpp-2.0.0-rc3/bld\n"
-                "sudo cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local\n"
-                "sudo make install\n",
-        },
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]",
-        "python -m unidic download",
-    ],
+    docker_image=[{"image": "huggingface/transformers-custom-tokenizers"}],
+    install_steps=["uv venv","uv pip install -e ."],
    parallelism=None,
    resource_class=None,
    tests_to_run=[
@ -406,14 +309,9 @@ examples_torch_job = CircleCIJob(
    "examples_torch",
    additional_env={"OMP_NUM_THREADS": 8},
    cache_name="torch_examples",
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[sklearn,torch,sentencepiece,testing,torch-speech]",
-        "pip install -U --upgrade-strategy eager -r examples/pytorch/_tests_requirements.txt",
-        # Without --no-deps we can't pin dependency versions in the future
-        "pip install -U --upgrade-strategy eager --no-deps -e git+https://github.com/huggingface/accelerate@main#egg=accelerate",
-    ],
+    docker_image=[{"image":"huggingface/transformers-examples-torch"}],
+    # TODO @ArthurZucker remove this once docker is easier to build
+    install_steps=["uv venv && uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
    pytest_num_workers=1,
 )

@ -421,35 +319,20 @@ examples_torch_job = CircleCIJob(
 examples_tensorflow_job = CircleCIJob(
    "examples_tensorflow",
    cache_name="tensorflow_examples",
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y cmake",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[sklearn,tensorflow,sentencepiece,testing]",
-        "pip install -U --upgrade-strategy eager -r examples/tensorflow/_tests_requirements.txt",
-    ],
-)
-
-
-examples_flax_job = CircleCIJob(
-    "examples_flax",
-    cache_name="flax_examples",
-    install_steps=[
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[flax,testing,sentencepiece]",
-        "pip install -U --upgrade-strategy eager -r examples/flax/_tests_requirements.txt",
-    ],
+    docker_image=[{"image":"huggingface/transformers-examples-tf"}],
+    install_steps=["uv venv && uv pip install ."],
+    parallelism=8
 )


 hub_job = CircleCIJob(
    "hub",
    additional_env={"HUGGINGFACE_CO_STAGING": True},
+    docker_image=[{"image":"huggingface/transformers-torch-light"}],
    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install git-lfs",
+        "uv venv && uv pip install .",
        'git config --global user.email "ci@dummy.com"',
        'git config --global user.name "ci"',
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[torch,sentencepiece,testing,vision]",
    ],
    marker="is_staging_test",
    pytest_num_workers=1,
@ -458,10 +341,11 @@ hub_job = CircleCIJob(

 onnx_job = CircleCIJob(
    "onnx",
+    docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y cmake",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]",
+        "uv venv && uv pip install .",
+        "uv pip install --upgrade eager pip",
+        "uv pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]",
    ],
    pytest_options={"k onnx": None},
    pytest_num_workers=1,
@ -470,22 +354,8 @@ onnx_job = CircleCIJob(

 exotic_models_job = CircleCIJob(
    "exotic_models",
-    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[torch,testing,vision]",
-        "pip install -U --upgrade-strategy eager torchvision",
-        "pip install -U --upgrade-strategy eager scipy",
-        "pip install -U --upgrade-strategy eager 'git+https://github.com/facebookresearch/detectron2.git'",
-        "sudo apt install tesseract-ocr",
-        "pip install -U --upgrade-strategy eager pytesseract",
-        "pip install --upgrade-strategy eager sentencepiece",
-        "pip install -U --upgrade-strategy eager natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels",
-        "pip install -U --upgrade-strategy eager python-Levenshtein",
-        "pip install -U --upgrade-strategy eager opencv-python",
-        "pip install -U --upgrade-strategy eager nltk",
-        "pip uninstall -y torch torchvision torchaudio && pip install -U --upgrade-strategy eager 'torch<2.2.0' 'torchvision<0.17' 'torchaudio<2.2.0'"
-    ],
+    install_steps=["uv venv && uv pip install ."],
+    docker_image=[{"image":"huggingface/transformers-exotic-models"}],
    tests_to_run=[
        "tests/models/*layoutlmv*",
        "tests/models/*nat",
@ -493,17 +363,16 @@ exotic_models_job = CircleCIJob(
        "tests/models/udop",
        "tests/models/nougat",
    ],
-    pytest_num_workers=1,
+    pytest_num_workers=12,
+    parallelism=4,
    pytest_options={"durations": 100},
 )


 repo_utils_job = CircleCIJob(
    "repo_utils",
-    install_steps=[
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager .[quality,testing,torch]",
-    ],
+    docker_image=[{"image":"huggingface/transformers-consistency"}],
+    install_steps=["uv venv && uv pip install ."],
    parallelism=None,
    pytest_num_workers=1,
    resource_class="large",
@ -519,20 +388,9 @@ py_command = f"$(python3 -c '{py_command}')"
 command = f'echo "{py_command}" > pr_documentation_tests_temp.txt'
 doc_test_job = CircleCIJob(
    "pr_documentation_tests",
+    docker_image=[{"image":"huggingface/transformers-consistency"}],
    additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"},
    install_steps=[
-        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time ffmpeg",
-        "pip install --upgrade --upgrade-strategy eager pip",
-        "pip install -U --upgrade-strategy eager -e .[dev]",
-        # Without --no-deps we can't pin dependency versions in the future
-        "pip install -U --upgrade-strategy eager --no-deps -e git+https://github.com/huggingface/accelerate@main#egg=accelerate",
-        "pip install --upgrade --upgrade-strategy eager 'pytest<8.0.0' pytest-sugar",
-        "pip install -U --upgrade-strategy eager natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels",
-        "pip install -U --upgrade-strategy eager g2p-en",
-        # TODO: remove this one after fixing the dependency issue(s) above
-        "pip install -U --upgrade-strategy eager torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu",
-        "find -name __pycache__ -delete",
-        "find . -name \*.pyc -delete",
        # Add an empty file to keep the test step running correctly even no file is selected to be tested.
        "touch dummy.py",
        {
@ -566,11 +424,11 @@ REGULAR_TESTS = [
    hub_job,
    onnx_job,
    exotic_models_job,
+    tokenization_job
 ]
 EXAMPLES_TESTS = [
    examples_torch_job,
    examples_tensorflow_job,
-    examples_flax_job,
 ]
 PIPELINE_TESTS = [
    pipelines_torch_job,
--- a/.circleci/parse_test_outputs.py
+++ b/.circleci/parse_test_outputs.py
@ -0,0 +1,70 @@
+import re
+import argparse
+
+def parse_pytest_output(file_path):
+    skipped_tests = {}
+    skipped_count = 0
+    with open(file_path, 'r') as file:
+        for line in file:
+            match = re.match(r'^SKIPPED \[(\d+)\] (tests/.*): (.*)$', line)
+            if match:
+                skipped_count += 1
+                test_file, test_line, reason = match.groups()
+                skipped_tests[reason] = skipped_tests.get(reason, []) + [(test_file, test_line)]
+    for k,v in sorted(skipped_tests.items(), key=lambda x:len(x[1])):
+        print(f"{len(v):4} skipped because: {k}")
+    print("Number of skipped tests:", skipped_count)
+
+def parse_pytest_failure_output(file_path):
+    failed_tests = {}
+    failed_count = 0
+    with open(file_path, 'r') as file:
+        for line in file:
+            match = re.match(r'^FAILED (tests/.*) - (.*): (.*)$', line)
+            if match:
+                failed_count += 1
+                _, error, reason = match.groups()
+                failed_tests[reason] = failed_tests.get(reason, []) + [error]
+    for k,v in sorted(failed_tests.items(), key=lambda x:len(x[1])):
+        print(f"{len(v):4} failed because `{v[0]}` -> {k}")
+    print("Number of failed tests:", failed_count)
+    if failed_count>0:
+        exit(1)
+
+def parse_pytest_errors_output(file_path):
+    print(file_path)
+    error_tests = {}
+    error_count = 0
+    with open(file_path, 'r') as file:
+        for line in file:
+            match = re.match(r'^ERROR (tests/.*) - (.*): (.*)$', line)
+            if match:
+                error_count += 1
+                _, test_error, reason = match.groups()
+                error_tests[reason] = error_tests.get(reason, []) + [test_error]
+    for k,v in sorted(error_tests.items(), key=lambda x:len(x[1])):
+        print(f"{len(v):4} errored out because of `{v[0]}` -> {k}")
+    print("Number of errors:", error_count)
+    if error_count>0:
+        exit(1)
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--file", help="file to parse")
+    parser.add_argument("--skip", action="store_true", help="show skipped reasons")
+    parser.add_argument("--fail", action="store_true", help="show failed tests")
+    parser.add_argument("--errors", action="store_true", help="show failed tests")
+    args = parser.parse_args()
+
+    if args.skip:
+        parse_pytest_output(args.file)
+
+    if args.fail:
+        parse_pytest_failure_output(args.file)
+
+    if args.errors:
+        parse_pytest_errors_output(args.file)
+
+
+if __name__ == "__main__":
+    main()
--- a/.github/actions/post-slack/action.yml
+++ b/.github/actions/post-slack/action.yml
@ -1,79 +0,0 @@
-name: Send message to slack
-
-description: 'Send results to slack'
-author: 'Hugging Face'
-inputs:
-  slack_channel:
-    required: true
-    type: string
-  title:
-    required: true
-    type: string
-  status:
-    required: true
-    type: string
-  slack_token:
-    required: true
-    type: string
-
-runs:
-  using: "composite"
-  steps:
-    - name: Create content to post
-      id: create-message
-      run: |
-        if [ "${{ inputs.status }}" == "success" ]; then
-          echo STATUS_MESSAGE='🟢 Tests are passing!' >> $GITHUB_ENV
-        else
-          echo STATUS_MESSAGE='🔴 Tests failed! Please check the GitHub action link below' >> $GITHUB_ENV
-        fi
-      shell: bash
-
-    - name: Post Canceled results Slack channel
-      id: post-slack
-      uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
-      with:
-        # Slack channel id, channel name, or user id to post message.
-        # See also: https://api.slack.com/methods/chat.postMessage#channels
-        channel-id: ${{ inputs.slack_channel }}
-        # For posting a rich message using Block Kit
-        payload: |
-          {
-            "text": "${{ inputs.title }}",
-            "blocks": [
-              {
-                "type": "header",
-                "text": {
-                    "type": "plain_text",
-                    "text": "${{ inputs.title }}"
-                }
-              },
-              {
-                "type": "section",
-                "text": {
-                  "type": "mrkdwn",
-                  "text": "${{ env.STATUS_MESSAGE }}"
-                }
-              },
-              {
-                "type": "section",
-                "text": {"type": "mrkdwn", "text": "*Click the button for more details about the commit*"},
-                "accessory": {
-                    "type": "button",
-                    "text": {"type": "plain_text", "text": "Check Commit results"},
-                    "url": "${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
-                }
-              },
-              {
-                "type": "section",
-                "text": {"type": "mrkdwn", "text": "*Click here for more details about the action ran*"},
-                "accessory": {
-                    "type": "button",
-                    "text": {"type": "plain_text", "text": "Check Action results"},
-                    "url": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-                }
-              }
-            ]
-          }
-      env:
-        SLACK_BOT_TOKEN: ${{ inputs.slack_token }}
--- a/.github/workflows/build-ci-docker-images.yml
+++ b/.github/workflows/build-ci-docker-images.yml
@ -0,0 +1,54 @@
+name: Build pr ci-docker
+
+on:
+  push:
+    branches:
+      - change-ci # for now let's only build on this branch
+  repository_dispatch:
+  workflow_call:
+    inputs:
+      image_postfix:
+        required: true
+        type: string
+  schedule:
+    - cron: "6 0 * * *"
+
+
+concurrency:
+  group: ${{ github.workflow }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    runs-on: ubuntu-22.04
+
+    if: ${{ contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' }}
+
+    strategy:
+      matrix:
+        file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "torch-jax-light", "jax-light", "examples-torch",  "examples-tf"]
+    continue-on-error: true 
+
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v4
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build ${{ matrix.file }}.dockerfile
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker
+          build-args: |
+            REF=${{ github.sha }}
+          file: "./docker/${{ matrix.file }}.dockerfile"
+          push: true
+          tags: huggingface/transformers-${{ matrix.file }}
--- a/.github/workflows/push-important-models.yml
+++ b/.github/workflows/push-important-models.yml
@ -97,7 +97,7 @@ jobs:
      
      - name: Post to Slack
        if: always()
-        uses: ./.github/actions/post-slack
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }}
          title: 🤗 Results of the FA2 tests - ${{ matrix.model-name }}
@ -119,7 +119,7 @@ jobs:

      - name: Post to Slack
        if: always()
-        uses: ./.github/actions/post-slack
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main 
        with:
          slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }}
          title: 🤗 Results of the Integration tests - ${{ matrix.model-name }}
--- a/.github/workflows/self-new-model-pr-caller.yml
+++ b/.github/workflows/self-new-model-pr-caller.yml
@ -4,6 +4,11 @@ on:
  pull_request:
    paths:
      - "src/transformers/models/*/modeling_*.py"
+      - "tests/models/*/test_*.py"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true

 env:
  HF_HOME: /mnt/cache
@ -20,31 +25,46 @@ env:
  CUDA_VISIBLE_DEVICES: 0,1

 jobs:
-  check_for_new_model:
+  find_models_to_run:
      runs-on: ubuntu-22.04
-      name: Check if a PR is a new model PR
+      name: Find models to run slow tests
+      # Triggered only if the required label `run-slow` is added
+      if: ${{ contains(github.event.pull_request.labels.*.name, 'run-slow') }}
      outputs:
-        new_model: ${{ steps.check_new_model.outputs.new_model }}
+        models: ${{ steps.models_to_run.outputs.models }}
      steps:
        - uses: actions/checkout@v4
          with:
            fetch-depth: "0"
+            ref: ${{ github.event.pull_request.head.sha }}

-        - name: Check if there is a new model
-          id: check_new_model
+        - name: Get commit message
          run: |
+            echo "commit_message=$(git show -s --format=%s)" >> $GITHUB_ENV
+
+        - name: Get models to run slow tests
+          run: |
+            echo "${{ env.commit_message }}"
            python -m pip install GitPython
-            echo "new_model=$(python utils/check_if_new_model_added.py | tail -n 1)" >> $GITHUB_OUTPUT
+            python utils/pr_slow_ci_models.py --commit_message "${{ env.commit_message }}" | tee output.txt
+            echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV
+
+        - name: Models to run slow tests
+          id: models_to_run
+          run: |
+            echo "${{ env.models }}"
+            echo "models=${{ env.models }}" >> $GITHUB_OUTPUT

  run_models_gpu:
-      name: Run all tests for the new model
-      # Triggered if it is a new model PR and the required label is added
-      if: ${{ needs.check_for_new_model.outputs.new_model != '' && contains(github.event.pull_request.labels.*.name, 'single-model-run-slow') }}
-      needs: check_for_new_model
+      name: Run all tests for the model
+      # Triggered only `find_models_to_run` is triggered (label `run-slow` is added) which gives the models to run
+      # (either a new model PR or via a commit message)
+      if: ${{ needs.find_models_to_run.outputs.models != '[]' }}
+      needs: find_models_to_run
      strategy:
        fail-fast: false
        matrix:
-          folders: ["${{ needs.check_for_new_model.outputs.new_model }}"]
+          folders: ${{ fromJson(needs.find_models_to_run.outputs.models) }}
          machine_type: [single-gpu, multi-gpu]
      runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, ci]
      container:
--- a/.github/workflows/self-push-amd-mi300-caller.yml
+++ b/.github/workflows/self-push-amd-mi300-caller.yml
@ -0,0 +1,25 @@
+name: Self-hosted runner (AMD mi300 CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (push-caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_push_ci_caller*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  run_amd_ci:
+    name: AMD mi300
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci'))))
+    uses: ./.github/workflows/self-push-amd.yml
+    with:
+      gpu_flavor: mi300
+    secrets: inherit
--- a/.github/workflows/self-push-amd.yml
+++ b/.github/workflows/self-push-amd.yml
@ -36,7 +36,7 @@ jobs:
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -57,7 +57,7 @@ jobs:
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -155,7 +155,7 @@ jobs:
      matrix:
        folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -230,7 +230,7 @@ jobs:
      - name: Run all non-slow selected tests on GPU
        working-directory: /transformers
        run: |
-          python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}
+          python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
--- a/.github/workflows/self-scheduled-amd-mi210-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@ -16,4 +16,5 @@ jobs:
    uses: ./.github/workflows/self-scheduled-amd.yml
    with:
      gpu_flavor: mi210
+      slack_report_channel: "#transformers-ci-daily-amd"
    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-mi250-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@ -16,4 +16,5 @@ jobs:
    uses: ./.github/workflows/self-scheduled-amd.yml
    with:
      gpu_flavor: mi250
+      slack_report_channel: "#transformers-ci-daily-amd"
    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-mi300-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi300-caller.yml
@ -0,0 +1,21 @@
+name: Self-hosted runner (AMD mi300 scheduled CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_scheduled_ci_caller*
+
+jobs:
+  run_amd_ci:
+    name: AMD mi300
+    needs: build-docker-containers
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci'))))
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      gpu_flavor: mi300
+      slack_report_channel: "#transformers-ci-daily-amd"
+    secrets: inherit
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@ -34,7 +34,7 @@ jobs:
          fetch-depth: 2

      - name: Check Runner Status
-        run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+        run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

  check_runners:
    name: Check Runners
@ -42,7 +42,7 @@ jobs:
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -63,7 +63,7 @@ jobs:
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -116,7 +116,7 @@ jobs:
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -162,7 +162,7 @@ jobs:

      - name: Run all tests on GPU
        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
@ -184,7 +184,7 @@ jobs:
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machine_type: [multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -230,7 +230,7 @@ jobs:

      - name: Run all tests on GPU
        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
@ -250,7 +250,7 @@ jobs:
      fail-fast: false
      matrix:
        machine_type: [single-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -287,7 +287,7 @@ jobs:
        working-directory: /transformers
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch
+          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
@ -307,7 +307,7 @@ jobs:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -343,7 +343,7 @@ jobs:
      - name: Run all pipeline tests on GPU
        working-directory: /transformers
        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
@ -364,7 +364,7 @@ jobs:
      matrix:
        machine_type: [single-gpu, multi-gpu]

-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
    needs: setup
    container:
      image: huggingface/transformers-pytorch-deepspeed-amd-gpu
@ -400,7 +400,7 @@ jobs:

      - name: Run all tests on GPU
        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
--- a/.github/workflows/slack-report.yml
+++ b/.github/workflows/slack-report.yml
@ -60,12 +60,10 @@ jobs:

      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
      - name: Failure table artifacts
-        # Only the model testing job is concerned for this step
-        if: ${{ inputs.job == 'run_models_gpu' }}
        uses: actions/upload-artifact@v4
        with:
-          name: prev_ci_results
-          path: prev_ci_results
+          name: ci_results_${{ inputs.job }}
+          path: ci_results_${{ inputs.job }}
      
      - uses: actions/checkout@v4
      - uses: actions/download-artifact@v4
@ -77,6 +75,7 @@ jobs:
          SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
          CI_EVENT: scheduled
          CI_SHA: ${{ github.sha }}
+          CI_TEST_JOB: ${{ inputs.job }}
          SETUP_STATUS: ${{ inputs.setup_status }}
        # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change
        # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`.
@ -85,3 +84,11 @@ jobs:
          pip install slack_sdk
          pip show slack_sdk
          python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" 
+
+      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
+      - name: Failure table artifacts
+        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ci_results_${{ inputs.job }}
+          path: ci_results_${{ inputs.job }}
--- a/README_fr.md
+++ b/README_fr.md
@ -288,7 +288,6 @@ Suivez les pages d'installation de Flax, PyTorch ou TensorFlow pour voir comment

 Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)

-
 🤗 Transformers fournit actuellement les architectures suivantes: consultez [ici](https://huggingface.co/docs/transformers/model_summary) pour un résumé global de chacune d'entre elles.

 Pour vérifier si chaque modèle a une implémentation en Flax, PyTorch ou TensorFlow, ou s'il a un tokenizer associé pris en charge par la bibliothèque 🤗 Tokenizers, consultez [ce tableau](https://huggingface.co/docs/transformers/index#supported-frameworks).
--- a/README_te.md
+++ b/README_te.md
@ -293,7 +293,6 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా

 🤗 ట్రాన్స్‌ఫార్మర్లు ప్రస్తుతం కింది ఆర్కిటెక్చర్‌లను అందజేస్తున్నాయి: వాటిలో ప్రతి ఒక్కటి ఉన్నత స్థాయి సారాంశం కోసం [ఇక్కడ](https://huggingface.co/docs/transformers/model_summary) చూడండి.

-
 ఈ అమలులు అనేక డేటాసెట్‌లలో పరీక్షించబడ్డాయి (ఉదాహరణ స్క్రిప్ట్‌లను చూడండి) మరియు అసలైన అమలుల పనితీరుతో సరిపోలాలి. మీరు [డాక్యుమెంటేషన్](https://github.com/huggingface/transformers/tree/main/examples) యొక్క ఉదాహరణల విభాగంలో పనితీరుపై మరిన్ని వివరాలను కనుగొనవచ్చు.

 ## ఇంకా నేర్చుకో
--- a/conftest.py
+++ b/conftest.py
@ -71,7 +71,7 @@ NOT_DEVICE_TESTS = {
    "ModelTester::test_pipeline_",
    "/repo_utils/",
    "/utils/",
-    "/tools/",
+    "/agents/",
 }

 # allow having multiple repository checkouts and not needing to remember to rerun
@ -94,7 +94,7 @@ def pytest_configure(config):
    config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
    config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
    config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
-    config.addinivalue_line("markers", "tool_tests: mark the tool tests that are run on their specific schedule")
+    config.addinivalue_line("markers", "agent_tests: mark the agent tests that are run on their specific schedule")
    config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")


--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -0,0 +1,14 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update && apt-get install -y time git pkg-config make git-lfs
+ENV VIRTUAL_ENV=/usr/local
+RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
+RUN uv pip install --no-cache-dir --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir tensorflow-cpu tf-keras
+RUN uv pip install --no-cache-dir "transformers[flax,quality,vision,testing]"
+RUN git lfs install
+
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
+
--- a/docker/custom-tokenizers.dockerfile
+++ b/docker/custom-tokenizers.dockerfile
@ -0,0 +1,26 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+
+RUN wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
+RUN tar xvf jumanpp-2.0.0-rc3.tar.xz
+RUN mkdir jumanpp-2.0.0-rc3/bld
+WORKDIR ./jumanpp-2.0.0-rc3/bld
+RUN wget -LO catch.hpp https://github.com/catchorg/Catch2/releases/download/v2.13.8/catch.hpp
+RUN mv catch.hpp ../libs/
+RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
+RUN make install -j 10
+
+
+RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
+RUN uv pip install  --no-cache-dir "transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
+# spacy is not used so not tested. Causes to failures. TODO fix later
+RUN python3 -m unidic download
+RUN pip uninstall -y transformers
+
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN apt remove -y g++ cmake  xz-utils libprotobuf-dev protobuf-compiler
--- a/docker/examples-tf.dockerfile
+++ b/docker/examples-tf.dockerfile
@ -0,0 +1,12 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git
+RUN apt-get install -y g++ cmake
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv && uv venv
+RUN uv pip install --no-cache-dir -U pip setuptools albumentations seqeval
+RUN pip install  --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir  "protobuf==3.20.3" 
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -0,0 +1,11 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
+RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/exotic-models.dockerfile
+++ b/docker/exotic-models.dockerfile
@ -0,0 +1,17 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir  --no-deps timm accelerate
+RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
+# RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
+RUN pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose'  'dataset'
+# RUN git clone https://github.com/facebookresearch/detectron2.git
+# RUN python3 -m pip install --no-cache-dir -e detectron2
+RUN pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3'
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/jax-light.dockerfile
+++ b/docker/jax-light.dockerfile
@ -0,0 +1,9 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]"
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/pipeline-tf.dockerfile
+++ b/docker/pipeline-tf.dockerfile
@ -0,0 +1,9 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir "transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir  "protobuf==3.20.3" tensorflow_probability
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -0,0 +1,10 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
+RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]"
+RUN pip uninstall -y transformers
--- a/docker/quality.dockerfile
+++ b/docker/quality.dockerfile
@ -0,0 +1,8 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update && apt-get install -y time git 
+ENV VIRTUAL_ENV=/usr/local
+RUN pip install uv &&  uv venv
+RUN uv pip install --no-cache-dir -U pip setuptools GitPython transformers "ruff==0.1.5" urllib3
+RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/*
--- a/docker/tf-light.dockerfile
+++ b/docker/tf-light.dockerfile
@ -0,0 +1,11 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ pkg-config openssh-client git
+RUN apt-get install -y  cmake
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install  --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir  "protobuf==3.20.3" 
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/torch-jax-light.dockerfile
+++ b/docker/torch-jax-light.dockerfile
@ -0,0 +1,15 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update &&  apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN uv pip install --no-deps accelerate
+RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax, audio, sklearn,sentencepiece,vision,testing]"
+
+
+# RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]"
+
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -0,0 +1,10 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+USER root
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
+RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]"
+RUN pip uninstall -y transformers
--- a/docker/torch-tf-light.dockerfile
+++ b/docker/torch-tf-light.dockerfile
@ -0,0 +1,19 @@
+FROM python:3.10-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+RUN echo ${REF}
+USER root
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
+ENV VIRTUAL_ENV=/usr/local
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
+RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN git lfs install
+
+RUN uv pip install --no-cache-dir pypi-kenlm
+RUN pip install --no-cache-dir  "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]"
+RUN uv pip install --no-cache-dir  "protobuf==3.20.3" librosa
+
+
+RUN pip uninstall -y transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive
@ -9,11 +9,11 @@ SHELL ["sh", "-lc"]
 # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
 # to be used as arguments for docker build (so far).

-ARG PYTORCH='2.2.1'
+ARG PYTORCH='2.3.0'
 # (not always a valid torch version)
-ARG INTEL_TORCH_EXT='2.2.0'
+ARG INTEL_TORCH_EXT='2.3.0'
 # Example: `cu102`, `cu113`, etc.
-ARG CUDA='cu118'
+ARG CUDA='cu121'

 RUN apt update
 RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
@ -45,6 +45,9 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt
 # For video model testing
 RUN python3 -m pip install --no-cache-dir decord av==9.2.0

+# For GGUF tests
+RUN python3 -m pip install --no-cache-dir gguf
+
 # Some slow tests require bnb
 RUN python3 -m pip install --no-cache-dir bitsandbytes

--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -1,24 +1,19 @@
-FROM rocm/dev-ubuntu-20.04:5.6
+FROM rocm/dev-ubuntu-22.04:6.0.2
 # rocm/pytorch has no version with 2.1.0
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

-ARG PYTORCH='2.1.0'
-ARG TORCH_VISION='0.16.0'
-ARG TORCH_AUDIO='2.1.0'
-ARG ROCM='5.6'
-
 RUN apt update && \
-    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip ffmpeg && \
+    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg && \
    apt clean && \
    rm -rf /var/lib/apt/lists/*

-RUN python3 -m pip install --no-cache-dir --upgrade pip
+RUN python3 -m pip install --no-cache-dir --upgrade pip numpy

-RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM
+RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0

-RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
+RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"

 ARG REF=main
 WORKDIR /
@ -35,5 +30,5 @@ RUN python3 -m pip uninstall -y tensorflow flax
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop

-# Remove nvml as it is not compatible with ROCm
-RUN python3 -m pip uninstall py3nvml pynvml -y
+# Remove nvml as it is not compatible with ROCm. apex is not tested on NVIDIA either.
+RUN python3 -m pip uninstall py3nvml pynvml apex -y
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -42,7 +42,7 @@ RUN python3 -m pip uninstall -y deepspeed
 # This has to be run (again) inside the GPU VMs running the tests.
 # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
 # TODO: Find out why test fail.
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@ -11,7 +11,7 @@ ARG REF=main
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

 # If set to nothing, will install the latest version
-ARG PYTORCH='2.1.1'
+ARG PYTORCH='2.3.0'
 ARG TORCH_VISION=''
 ARG TORCH_AUDIO=''
 # Example: `cu102`, `cu113`, etc.
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive
@ -45,6 +45,9 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt
 # Add aqlm for quantization testing
 RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2

+# Add hqq for quantization testing
+RUN python3 -m pip install --no-cache-dir hqq
+
 # Add autoawq for quantization testing
 # >=v0.2.3 needed for compatibility with torch 2.2.1
 RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl
@ -57,4 +60,4 @@ RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git

 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
-RUN cd transformers && python3 setup.py develop
+RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-tensorflow-gpu/Dockerfile
+++ b/docker/transformers-tensorflow-gpu/Dockerfile
@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -23,7 +23,7 @@
    title: Load and train adapters with 🤗 PEFT
  - local: model_sharing
    title: Share your model
-  - local: transformers_agents
+  - local: agents
    title: Agents
  - local: llm_tutorial
    title: Generation with LLMs
@ -133,12 +133,12 @@
    title: Notebooks with examples
  - local: community
    title: Community resources
-  - local: custom_tools
-    title: Custom Tools and Prompts
  - local: troubleshooting
    title: Troubleshoot
  - local: hf_quantizer
    title: Contribute new quantization method
+  - local: gguf
+    title: Interoperability with GGUF files
  title: Developer guides
 - sections:
  - local: performance
@ -388,6 +388,8 @@
        title: I-BERT
      - local: model_doc/jamba
        title: Jamba
+      - local: model_doc/jetmoe
+        title: JetMoe
      - local: model_doc/jukebox
        title: Jukebox
      - local: model_doc/led
@ -784,6 +786,8 @@
        title: OWL-ViT
      - local: model_doc/owlv2
        title: OWLv2
+      - local: model_doc/paligemma
+        title: PaliGemma
      - local: model_doc/perceiver
        title: Perceiver
      - local: model_doc/pix2struct
@ -804,6 +808,8 @@
        title: TVP
      - local: model_doc/udop
        title: UDOP
+      - local: model_doc/video_llava
+        title: VideoLlava
      - local: model_doc/vilt
        title: ViLT
      - local: model_doc/vipllava
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@ -0,0 +1,490 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+# Agents and tools
+
+[[open-in-colab]]
+
+### What is an agent?
+
+Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling.) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to.
+
+One approach to overcome this weakness is to create an *agent*.
+
+An agent is a system that uses an LLM as its engine, and it has access to functions called *tools*.
+
+These *tools* are functions for performing a task, and they contain all necessary description for the agent to properly use them.
+
+The agent can be programmed to:
+- devise a series of actions/tools and run them all at once like the `CodeAgent` for example
+- plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one like the `ReactJsonAgent` for example
+
+### Types of agents
+
+#### Code agent
+
+This agent has a planning step, then generates python code to execute all its actions at once. It natively handles different input and output types for its tools, thus it is the recommended choice for multimodal tasks.
+
+#### React agents
+
+This is the go-to agent to solve reasoning tasks, since the ReAct framework ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) makes it really efficient to think on the basis of its previous observations.
+
+We implement two versions of ReactJsonAgent: 
+- [`~ReactJsonAgent`] generates tool calls as a JSON in its output.
+- [`~ReactCodeAgent`] is a new type of ReactJsonAgent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance.
+
+> [!TIP]
+> Read [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more the ReAct agent.
+
+![Framework of a React Agent](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png)
+
+For example, here is how a ReAct agent would work its way through the following question.
+
+```py3
+>>> agent.run(
+...     "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
+... )
+=====New task=====
+How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
+====Agent is executing the code below:
+bert_blocks = search(query="number of blocks in BERT base encoder")
+print("BERT blocks:", bert_blocks)
+====
+Print outputs:
+BERT blocks: twelve encoder blocks
+
+====Agent is executing the code below:
+attention_layer = search(query="number of layers in Attention is All You Need")
+print("Attention layers:", attention_layer)
+====
+Print outputs:
+Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture.
+
+====Agent is executing the code below:
+bert_blocks = 12
+attention_layers = 6
+diff = bert_blocks - attention_layers
+print("Difference in blocks:", diff)
+final_answer(diff)
+====
+
+Print outputs:
+Difference in blocks: 6
+
+Final answer: 6
+```
+
+### How can I build an agent?
+
+To initialize an agent, you need these arguments:
+
+- an LLM to power your agent - the agent is not exactly the LLM, it’s more like the agent is a program that uses an LLM as its engine.
+- a system prompt: what the LLM engine will be prompted with to generate its output
+- a toolbox from which the agent pick tools to execute
+- a parser to extract from the LLM output which tools are to call and with which arguments
+
+Upon initialization of the agent system, the tool attributes are used to generate a tool description, then baked into the agent’s `system_prompt` to let it know which tools it can use and why.
+
+To start with, please install the `agents` extras in order to install all default dependencies.
+
+```bash
+pip install transformers[agents]
+```
+
+Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating.) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating.
+
+```python
+from huggingface_hub import login, InferenceClient
+
+login("<YOUR_HUGGINGFACEHUB_API_TOKEN>")
+
+client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
+
+def llm_engine(messages, stop_sequences=["Task"]) -> str:
+    response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
+    answer = response.choices[0].message.content
+    return answer
+```
+
+You could use any `llm_engine` method as long as:
+1. it follows the [messages format](./chat_templating.md) for its input (`List[Dict[str, str]]`) and returns a `str`
+2. it stops generating outputs at the sequences passed in the argument `stop`
+
+You also need a `tools` argument which accepts a list of `Tools`. You can provide an empty list for `tools`, but use the default toolbox with the optional argument `add_base_tools=True`.
+
+Now you can create an agent, like `CodeAgent`, and run it. For convenience, we also provide the `HfEngine` class that uses `huggingface_hub.InferenceClient` under the hood.
+
+```python
+from transformers import CodeAgent, HfEngine
+
+llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run(
+    "Could you translate this sentence from French, say it out loud and return the audio.",
+    sentence="Où est la boulangerie la plus proche?",
+)
+```
+
+This will be handy in case of emergency baguette need!
+You can even leave the argument `llm_engine` undefined, and an [~HfEngine] will be created by default.
+
+```python
+from transformers import CodeAgent
+
+agent = CodeAgent(tools=[], add_base_tools=True)
+
+agent.run(
+    "Could you translate this sentence from French, say it out loud and give me the audio.",
+    sentence="Où est la boulangerie la plus proche?",
+)
+```
+
+Note that we used an additional `sentence` argument: you can pass text as additional arguments to the model.
+
+You can also use this to indicate the path to local or remote files for the model to use:
+
+```py
+from transformers import ReactCodeAgent
+
+agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
+```
+
+
+The prompt and output parser were automatically defined, but you can easily inspect them by calling the `system_prompt_template` on your agent.
+
+```python
+print(agent.system_prompt_template)
+```
+
+It's important to explain as clearly as possible the task you want to perform.
+Every [`~Agent.run`] operation is independent, and since an agent is powered by an LLM, minor variations in your prompt might yield completely different results.
+You can also run an agent consecutively for different tasks: each time the attributes `agent.task` and `agent.logs` will be re-initialized.
+
+
+#### Code execution
+
+A Python interpreter executes the code on a set of inputs passed along with your tools.
+This should be safe because the only functions that can be called are the tools you provided (especially if it's only tools by Hugging Face) and the print function, so you're already limited in what can be executed.
+
+The Python interpreter also doesn't allow any attribute lookup or imports (which shouldn't be needed for passing inputs/outputs to a small set of functions) so all the most obvious attacks shouldn't be an issue.
+
+The execution will stop at any code trying to perform an illegal operation or if there is a regular Python error with the code generated by the agent.
+
+### The system prompt
+
+An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the `ReactCodeAgent` (below version is slightly simplified).
+
+```text
+You will be given a task to solve as best you can.
+You have access to the following tools:
+<<tool_descriptions>>
+
+To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
+
+At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
+Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence.
+During each intermediate step, you can use 'print()' to save whatever important information you will then need.
+These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
+
+In the end you have to return a final answer using the `final_answer` tool.
+
+Here are a few examples using notional tools:
+---
+{examples}
+
+Above example were using notional tools that might not exist for you. You only have acces to those tools:
+<<tool_names>>
+You also can perform computations in the python code you generate.
+
+Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward.
+
+Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
+Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
+
+Remember to make sure that variables you use are all defined.
+
+Now Begin!
+```
+
+The system prompt includes:
+- An *introduction* that explains how the agent should behave and what tools are.
+- A description of all the tools that is defined by a `<<tool_descriptions>>` token that is dynamically replaced at runtime with the tools defined/chosen by the user.
+    - The tool description comes from the tool attributes, `name`, `description`, `inputs` and `output_type`,  and a simple `jinja2` template that you can refine.
+- The expected output format.
+
+You could improve the system prompt, for example, by adding an explanation of the output format.
+
+For maximum flexibility, you can overwrite the whole system prompt template by passing your custom prompt as an argument to the `system_prompt` parameter.
+
+```python
+from transformers import ReactJsonAgent
+from transformers.agents import PythonInterpreterTool
+
+agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
+```
+
+> [!WARNING]
+> Please make sure to define the `<<tool_descriptions>>` string somewhere in the `template` so the agent is aware 
+of the available tools.
+
+## Tools
+
+A tool is an atomic function to be used by an agent.
+
+You can for instance check the [~PythonInterpreterTool]: it has a name, a description, input descriptions, an output type, and a `__call__` method to perform the action.
+
+When the agent is initialized, the tool attributes are used to generate a tool description which is baked into the agent's system prompt. This lets the agent know which tools it can use and why.
+
+### Default toolbox
+
+Transformers comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools = True`:
+
+- **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut))
+- **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt))
+- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper))
+- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5))
+- **Translation**: translates a given sentence from source language to target language.
+- **Python code interpreter**: runs your the LLM generated Python code in a secure environment. This tool will only be added to [~ReactJsonAgent] if you use `add_base_tools=True`, since code-based tools can already execute Python code
+
+
+You can manually use a tool by calling the [`load_tool`] function and a task to perform.
+
+
+```python
+from transformers import load_tool
+
+tool = load_tool("text-to-speech")
+audio = tool("This is a text to speech tool")
+```
+
+
+### Create a new tool
+
+You can create your own tool for use cases not covered by the default tools from Hugging Face.
+For example, let's create a tool that returns the most downloaded model for a given task from the Hub.
+
+You'll start with the code below.
+
+```python
+from huggingface_hub import list_models
+
+task = "text-classification"
+
+model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+print(model.id)
+```
+
+This code can be converted into a class that inherits from the [`Tool`] superclass.
+
+
+The custom tool needs:
+- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name is `model_download_counter`.
+- An attribute `description` is used to populate the agent's system prompt.
+- An `inputs` attribute, which is a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input.
+- An `output_type` attribute, which specifies the output type.
+- A `forward` method which contains the inference code to be executed.
+
+
+```python
+from transformers import Tool
+from huggingface_hub import list_models
+
+class HFModelDownloadsTool(Tool):
+    name = "model_download_counter"
+    description = (
+        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
+        "It returns the name of the checkpoint."
+    )
+
+    inputs = {
+        "task": {
+            "type": "text",
+            "description": "the task category (such as text-classification, depth-estimation, etc)",
+        }
+    }
+    output_type = "text"
+
+    def forward(self, task: str):
+        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+        return model.id
+```
+
+Now that the custom `HfModelDownloadsTool` class is ready, you can save it to a file named `model_downloads.py` and import it for use.
+
+
+```python
+from model_downloads import HFModelDownloadsTool
+
+tool = HFModelDownloadsTool()
+```
+
+You can also share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with read access.
+
+```python
+tool.push_to_hub("{your_username}/hf-model-downloads")
+```
+
+Load the tool with the [`~Tool.load_tool`] function and pass it to the `tools` parameter in your agent.
+
+```python
+from transformers import load_tool, CodeAgent
+
+model_download_tool = load_tool("m-ric/hf-model-downloads")
+agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
+agent.run(
+    "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
+)
+```
+
+You get the following:
+```text
+======== New task ========
+Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
+==== Agent is executing the code below:
+most_downloaded_model = model_download_counter(task="text-to-video")
+print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
+====
+```
+
+And the output:
+`"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."`
+
+
+### Manage agent toolbox
+
+If you have already initialized an agent, it is inconvenient to reinitialize it from scratch with a tool you want to use. With Transformers, you can manage an agent's toolbox by adding or replacing a tool.
+
+Let's add the `model_download_tool` to an existing agent initialized with only the default toolbox.
+
+```python
+from transformers import CodeAgent
+
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+agent.toolbox.add_tool(model_download_tool)
+```
+Now we can leverage both the new tool and the previous text-to-speech tool:
+
+```python
+agent.run(
+    "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?"
+)
+```
+
+
+| **Audio**                                                                                                                                            |
+|------------------------------------------------------------------------------------------------------------------------------------------------------|
+| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
+
+
+> [!WARNING]
+> Beware when adding tools to an agent that already works well because it can bias selection towards your tool or select another tool other than the one already defined.
+
+
+Use the `agent.toolbox.update_tool()` method to replace an existing tool in the agent's toolbox.
+This is useful if your new tool is a one-to-one replacement of the existing tool because the agent already knows how to perform that specific task.
+Just make sure the new tool follows the same API as the replaced tool or adapt the system prompt template to ensure all examples using the replaced tool are updated.
+
+
+### Use a collection of tools
+
+You can leverage tool collections by using the ToolCollection object, with the slug of the collection you want to use.
+Then pass them as a list to initialize you agent, and start using them!
+
+```py
+from transformers import ToolCollection, ReactCodeAgent
+
+image_tool_collection = ToolCollection(collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f")
+agent = ReactCodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True)
+
+agent.run("Please draw me a picture of rivers and lakes.")
+```
+
+To speed up the start, tools are loaded only if called by the agent.
+
+This gets you this image:
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png">
+
+
+### Use gradio-tools
+
+[gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging
+Face Spaces as tools. It supports many existing Spaces as well as custom Spaces.
+
+Transformers supports `gradio_tools` with the [`Tool.from_gradio`] method. For example, let's use the [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) from `gradio-tools` toolkit for improving prompts to generate better images.
+
+Import and instantiate the tool, then pass it to the `Tool.from_gradio` method:
+
+```python
+from gradio_tools import StableDiffusionPromptGeneratorTool
+from transformers import Tool, load_tool, CodeAgent
+
+gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
+prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
+```
+
+Now you can use it just like any other tool. For example, let's improve the prompt  `a rabbit wearing a space suit`.
+
+```python
+image_generation_tool = load_tool('huggingface-tools/text-to-image')
+agent = CodeAgent(tools=[prompt_generator_tool, image_generation_tool], llm_engine=llm_engine)
+
+agent.run(
+    "Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit'
+)
+```
+
+The model adequately leverages the tool:
+```text
+======== New task ========
+Improve this prompt, then generate an image of it.
+You have been provided with these initial arguments: {'prompt': 'A rabbit wearing a space suit'}.
+==== Agent is executing the code below:
+improved_prompt = StableDiffusionPromptGenerator(query=prompt)
+while improved_prompt == "QUEUE_FULL":
+    improved_prompt = StableDiffusionPromptGenerator(query=prompt)
+print(f"The improved prompt is {improved_prompt}.")
+image = image_generator(prompt=improved_prompt)
+====
+```
+
+Before finally generating the image:
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png">
+
+
+> [!WARNING]
+> gradio-tools require *textual* inputs and outputs even when working with different modalities like image and audio objects. Image and audio inputs and outputs are currently incompatible.
+
+### Use LangChain tools
+
+We love Langchain and think it has a very compelling suite of tools.
+To import a tool from LangChain, use the `from_langchain()` method.
+
+Here is how you can use it to recreate the intro's search result using a LangChain web search tool.
+
+```python
+from langchain.agents import load_tools
+from transformers import Tool, ReactCodeAgent
+
+search_tool = Tool.from_langchain(load_tools(["serpapi"])[0])
+
+agent = ReactCodeAgent(tools=[search_tool])
+
+agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?")
+```
--- a/docs/source/en/custom_tools.md
+++ b/docs/source/en/custom_tools.md
@ -1,798 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Custom Tools and Prompts
-
-<Tip>
-
-If you are not aware of what tools and agents are in the context of transformers, we recommend you read the
-[Transformers Agents](transformers_agents) page first.
-
-</Tip>
-
-<Tip warning={true}>
-
-Transformers Agents is an experimental API that is subject to change at any time. Results returned by the agents
-can vary as the APIs or underlying models are prone to change.
-
-</Tip>
-
-Creating and using custom tools and prompts is paramount to empowering the agent and having it perform new tasks.
-In this guide we'll take a look at:
-
- How to customize the prompt
- How to use custom tools
- How to create custom tools
-
-## Customizing the prompt
-
-As explained in [Transformers Agents](transformers_agents) agents can run in [`~Agent.run`] and [`~Agent.chat`] mode.
-Both the `run` and `chat` modes underlie the same logic. The language model powering the agent is conditioned on a long 
-prompt and completes the prompt by generating the next tokens until the stop token is reached.
-The only difference between the two modes is that during the `chat` mode the prompt is extended with 
-previous user inputs and model generations. This allows the agent to have access to past interactions,
-seemingly giving the agent some kind of memory.
-
-### Structure of the prompt
-
-Let's take a closer look at how the prompt is structured to understand how it can be best customized.
-The prompt is structured broadly into four parts.
-
-1. Introduction: how the agent should behave, explanation of the concept of tools.
-2. Description of all the tools. This is defined by a `<<all_tools>>` token that is dynamically replaced at runtime with the tools defined/chosen by the user.
-3. A set of examples of tasks and their solution
-4. Current example, and request for solution.
-
-To better understand each part, let's look at a shortened version of how the `run` prompt can look like:
-
-````text
-I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task.
-[...]
-You can print intermediate results if it makes sense to do so.
-
-Tools:
- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question.
- image_captioner: This is a tool that generates a description of an image. It takes an input named `image` which should be the image to the caption and returns a text that contains the description in English.
-[...]
-
-Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French."
-
-I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
-
-Answer:
-```py
-translated_question = translator(question=question, src_lang="French", tgt_lang="English")
-print(f"The translated question is {translated_question}.")
-answer = image_qa(image=image, question=translated_question)
-print(f"The answer is {answer}")
-```
-
-Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner."
-
-I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
-
-Answer:
-```py
-answer = document_qa(document, question="What is the oldest person?")
-print(f"The answer is {answer}.")
-image = image_generator("A banner showing " + answer)
-```
-
-[...]
-
-Task: "Draw me a picture of rivers and lakes"
-
-I will use the following
-````
-
-The introduction (the text before *"Tools:"*) explains precisely how the model shall behave and what it should do.
-This part most likely does not need to be customized as the agent shall always behave the same way.
-
-The second part (the bullet points below *"Tools"*) is dynamically added upon calling `run` or `chat`. There are 
-exactly as many bullet points as there are tools in `agent.toolbox` and each bullet point consists of the name 
-and description of the tool:
-
-```text
- <tool.name>: <tool.description>
-```
-
-Let's verify this quickly by loading the document_qa tool and printing out the name and description.
-
-```py
-from transformers import load_tool
-
-document_qa = load_tool("document-question-answering")
-print(f"- {document_qa.name}: {document_qa.description}")
-```
-
-which gives:
-```text
- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question.
-```
-
-We can see that the tool name is short and precise. The description includes two parts, the first explaining 
-what the tool does and the second states what input arguments and return values are expected.
-
-A good tool name and tool description are very important for the agent to correctly use it. Note that the only
-information the agent has about the tool is its name and description, so one should make sure that both 
-are precisely written and match the style of the existing tools in the toolbox. In particular make sure the description
-mentions all the arguments expected by name in code-style, along with the expected type and a description of what they
-are.
-
-<Tip>
-
-Check the naming and description of the curated Transformers tools to better understand what name and 
-description a tool is expected to have. You can see all tools with the [`Agent.toolbox`] property.
-
-</Tip>
-
-The third part includes a set of curated examples that show the agent exactly what code it should produce
-for what kind of user request. The large language models empowering the agent are extremely good at 
-recognizing patterns in a prompt and repeating the pattern with new data. Therefore, it is very important
-that the examples are written in a way that maximizes the likelihood of the agent to generating correct,
-executable code in practice. 
-
-Let's have a look at one example:
-
-````text
-Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner."
-
-I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
-
-Answer:
-```py
-answer = document_qa(document, question="What is the oldest person?")
-print(f"The answer is {answer}.")
-image = image_generator("A banner showing " + answer)
-```
-
-````
-
-The pattern the model is prompted to repeat has three parts: The task statement, the agent's explanation of 
-what it intends to do, and finally the generated code. Every example that is part of the prompt has this exact 
-pattern, thus making sure that the agent will reproduce exactly the same pattern when generating new tokens.
-
-The prompt examples are curated by the Transformers team and rigorously evaluated on a set of 
-[problem statements](https://github.com/huggingface/transformers/blob/main/src/transformers/tools/evaluate_agent.py)
-to ensure that the agent's prompt is as good as possible to solve real use cases of the agent.
-
-The final part of the prompt corresponds to:
-```text
-Task: "Draw me a picture of rivers and lakes"
-
-I will use the following
-```
-
-is a final and unfinished example that the agent is tasked to complete. The unfinished example
-is dynamically created based on the actual user input. For the above example, the user ran:
-
-```py
-agent.run("Draw me a picture of rivers and lakes")
-```
-
-The user input - *a.k.a* the task: *"Draw me a picture of rivers and lakes"* is cast into the 
-prompt template: "Task: <task> \n\n I will use the following". This sentence makes up the final lines of the 
-prompt the agent is conditioned on, therefore strongly influencing the agent to finish the example 
-exactly in the same way it was previously done in the examples.
-
-Without going into too much detail, the chat template has the same prompt structure with the 
-examples having a slightly different style, *e.g.*:
-
-````text
-[...]
-
-=====
-
-Human: Answer the question in the variable `question` about the image stored in the variable `image`.
-
-Assistant: I will use the tool `image_qa` to answer the question on the input image.
-
-```py
-answer = image_qa(text=question, image=image)
-print(f"The answer is {answer}")
-```
-
-Human: I tried this code, it worked but didn't give me a good result. The question is in French
-
-Assistant: In this case, the question needs to be translated first. I will use the tool `translator` to do this.
-
-```py
-translated_question = translator(question=question, src_lang="French", tgt_lang="English")
-print(f"The translated question is {translated_question}.")
-answer = image_qa(text=translated_question, image=image)
-print(f"The answer is {answer}")
-```
-
-=====
-
-[...]
-````
-
-Contrary, to the examples of the `run` prompt, each `chat` prompt example has one or more exchanges between the 
-*Human* and the *Assistant*. Every exchange is structured similarly to the example of the `run` prompt. 
-The user's input is appended to behind *Human:* and the agent is prompted to first generate what needs to be done 
-before generating code. An exchange can be based on previous exchanges, therefore allowing the user to refer
-to past exchanges as is done *e.g.* above by the user's input of "I tried **this** code" refers to the 
-previously generated code of the agent.
-
-Upon running `.chat`, the user's input or *task* is cast into an unfinished example of the form:
-```text
-Human: <user-input>\n\nAssistant:
-```
-which the agent completes. Contrary to the `run` command, the `chat` command then appends the completed example
-to the prompt, thus giving the agent more context for the next `chat` turn.
-
-Great now that we know how the prompt is structured, let's see how we can customize it!
-
-### Writing good user inputs
-
-While large language models are getting better and better at understanding users' intentions, it helps 
-enormously to be as precise as possible to help the agent pick the correct task. What does it mean to be 
-as precise as possible?
-
-The agent sees a list of tool names and their description in its prompt. The more tools are added the 
-more difficult it becomes for the agent to choose the correct tool and it's even more difficult to choose
-the correct sequences of tools to run. Let's look at a common failure case, here we will only return 
-the code to analyze it.
-
-```py
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-
-agent.run("Show me a tree", return_code=True)
-```
-
-gives:
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_segmenter` to create a segmentation mask for the image.
-
-
-==Code generated by the agent==
-mask = image_segmenter(image, prompt="tree")
-```
-
-which is probably not what we wanted. Instead, it is more likely that we want an image of a tree to be generated.
-To steer the agent more towards using a specific tool it can therefore be very helpful to use important keywords that 
-are present in the tool's name and description. Let's have a look.
-```py
-agent.toolbox["image_generator"].description
-```
-
-```text
-'This is a tool that creates an image according to a prompt, which is a text description. It takes an input named `prompt` which contains the image description and outputs an image.
-```
-
-The name and description make use of the keywords "image", "prompt", "create" and "generate". Using these words will most likely work better here. Let's refine our prompt a bit.
-
-```py
-agent.run("Create an image of a tree", return_code=True)
-```
-
-gives:
-```text
-==Explanation from the agent==
-I will use the following tool `image_generator` to generate an image of a tree.
-
-
-==Code generated by the agent==
-image = image_generator(prompt="tree")
-```
-
-Much better! That looks more like what we want. In short, when you notice that the agent struggles to 
-correctly map your task to the correct tools, try looking up the most pertinent keywords of the tool's name
-and description and try refining your task request with it.
-
-### Customizing the tool descriptions
-
-As we've seen before the agent has access to each of the tools' names and descriptions. The base tools 
-should have very precise names and descriptions, however, you might find that it could help to change 
-the description or name of a tool for your specific use case. This might become especially important 
-when you've added multiple tools that are very similar or if you want to use your agent only for a certain 
-domain, *e.g.* image generation and transformations.
-
-A common problem is that the agent confuses image generation with image transformation/modification when 
-used a lot for image generation tasks, *e.g.*
-```py
-agent.run("Make an image of a house and a car", return_code=True)
-```
-returns
-```text
-==Explanation from the agent== 
-I will use the following tools `image_generator` to generate an image of a house and `image_transformer` to transform the image of a car into the image of a house.
-
-==Code generated by the agent==
-house_image = image_generator(prompt="A house")
-car_image = image_generator(prompt="A car")
-house_car_image = image_transformer(image=car_image, prompt="A house")
-```
-
-which is probably not exactly what we want here. It seems like the agent has a difficult time 
-to understand the difference between `image_generator` and `image_transformer` and often uses the two together.
-
-We can help the agent here by changing the tool name and description of `image_transformer`. Let's instead call it `modifier`
-to disassociate it a bit from "image" and "prompt":
-```py
-agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer")
-agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace(
-    "transforms an image according to a prompt", "modifies an image"
-)
-```
-
-Now "modify" is a strong cue to use the new image processor which should help with the above prompt. Let's run it again.
-
-```py
-agent.run("Make an image of a house and a car", return_code=True)
-```
-
-Now we're getting:
-```text
-==Explanation from the agent==
-I will use the following tools: `image_generator` to generate an image of a house, then `image_generator` to generate an image of a car.
-
-
-==Code generated by the agent==
-house_image = image_generator(prompt="A house")
-car_image = image_generator(prompt="A car")
-```
-
-which is definitely closer to what we had in mind! However, we want to have both the house and car in the same image. Steering the task more toward single image generation should help:
-
-```py
-agent.run("Create image: 'A house and car'", return_code=True)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_generator` to generate an image.
-
-
-==Code generated by the agent==
-image = image_generator(prompt="A house and car")
-```
-
-<Tip warning={true}>
-
-Agents are still brittle for many use cases, especially when it comes to 
-slightly more complex use cases like generating an image of multiple objects.
-Both the agent itself and the underlying prompt will be further improved in the coming 
-months making sure that agents become more robust to a variety of user inputs.
-
-</Tip>
-
-### Customizing the whole prompt
-
-To give the user maximum flexibility, the whole prompt template as explained in [above](#structure-of-the-prompt)
-can be overwritten by the user. In this case make sure that your custom prompt includes an introduction section, 
-a tool section, an example section, and an unfinished example section. If you want to overwrite the `run` prompt template, 
-you can do as follows:
-
-```py
-template = """ [...] """
-
-agent = HfAgent(your_endpoint, run_prompt_template=template)
-```
-
-<Tip warning={true}>
-
-Please make sure to have the `<<all_tools>>` string and the `<<prompt>>` defined somewhere in the `template` so that the agent can be aware 
-of the tools, it has available to it as well as correctly insert the user's prompt.
-
-</Tip>
-
-Similarly, one can overwrite the `chat` prompt template. Note that the `chat` mode always uses the following format for the exchanges:
-```text
-Human: <<task>>
-
-Assistant:
-```
-
-Therefore it is important that the examples of the custom `chat` prompt template also make use of this format.
-You can overwrite the `chat` template at instantiation as follows.
-
-```python
-template = """ [...] """
-
-agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template)
-```
-
-<Tip warning={true}>
-
-Please make sure to have the `<<all_tools>>` string defined somewhere in the `template` so that the agent can be aware 
-of the tools, it has available to it.
-
-</Tip>
-
-In both cases, you can pass a repo ID instead of the prompt template if you would like to use a template hosted by someone in the community. The default prompts live in [this repo](https://huggingface.co/datasets/huggingface-tools/default-prompts) as an example.
-
-To upload your custom prompt on a repo on the Hub and share it with the community just make sure:
- to use a dataset repository
- to put the prompt template for the `run` command in a file named `run_prompt_template.txt`
- to put the prompt template for the `chat` command in a file named `chat_prompt_template.txt`
-
-## Using custom tools
-
-<Tip warning={true}>
-
-Using custom tools in your local runtime means that you'll download code to run on your machine.
-
-ALWAYS inspect the tool you're downloading before loading it within your runtime, as you would do when
-installing a package using pip/npm/apt.
-
-</Tip>
-
-In this section, we'll be leveraging two existing custom tools that are specific to image generation:
-
- We replace [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation),
-  with [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool) 
-  to allow for more image modifications.
- We add a new tool for image upscaling to the default toolbox: 
-  [diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool) replace the existing image-transformation tool.
-
-We'll start by loading the custom tools with the convenient [`load_tool`] function:
-
-```py
-from transformers import load_tool
-
-controlnet_transformer = load_tool("diffusers/controlnet-canny-tool")
-upscaler = load_tool("diffusers/latent-upscaler-tool")
-```
-
-Upon adding custom tools to an agent, the tools' descriptions and names are automatically
-included in the agents' prompts. Thus, it is imperative that custom tools have
-a well-written description and name in order for the agent to understand how to use them.
-Let's take a look at the description and name of `controlnet_transformer`:
-
-```py
-print(f"Description: '{controlnet_transformer.description}'")
-print(f"Name: '{controlnet_transformer.name}'")
-```
-
-gives 
-```text
-Description: 'This is a tool that transforms an image with ControlNet according to a prompt. 
-It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.'
-Name: 'image_transformer'
-```
-
-The name and description are accurate and fit the style of the [curated set of tools](./transformers_agents#a-curated-set-of-tools).
-Next, let's instantiate an agent with `controlnet_transformer` and `upscaler`:
-
-```py
-tools = [controlnet_transformer, upscaler]
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools)
-```
-
-This command should give you the following info:
-
-```text
-image_transformer has been replaced by <transformers_modules.diffusers.controlnet-canny-tool.bd76182c7777eba9612fc03c0
-8718a60c0aa6312.image_transformation.ControlNetTransformationTool object at 0x7f1d3bfa3a00> as provided in `additional_tools`
-```
-
-The set of curated tools already has an `image_transformer` tool which is hereby replaced with our custom tool.
-
-<Tip>
-
-Overwriting existing tools can be beneficial if we want to use a custom tool exactly for the same task as an existing tool 
-because the agent is well-versed in using the specific task. Beware that the custom tool should follow the exact same API 
-as the overwritten tool in this case, or you should adapt the prompt template to make sure all examples using that
-tool are updated.
-
-</Tip>
-
-The upscaler tool was given the name `image_upscaler` which is not yet present in the default toolbox and is therefore simply added to the list of tools.
-You can always have a look at the toolbox that is currently available to the agent via the `agent.toolbox` attribute:
-
-```py
-print("\n".join([f"- {a}" for a in agent.toolbox.keys()]))
-```
-
-```text
- document_qa
- image_captioner
- image_qa
- image_segmenter
- transcriber
- summarizer
- text_classifier
- text_qa
- text_reader
- translator
- image_transformer
- text_downloader
- image_generator
- video_generator
- image_upscaler
-```
-
-Note how `image_upscaler` is now part of the agents' toolbox.
-
-Let's now try out the new tools! We will re-use the image we generated in [Transformers Agents Quickstart](./transformers_agents#single-execution-run).
-
-```py
-from diffusers.utils import load_image
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png"
-)
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200> 
-
-Let's transform the image into a beautiful winter landscape:
-
-```py
-image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_transformer` to transform the image.
-
-
-==Code generated by the agent==
-image = image_transformer(image, prompt="A frozen lake and snowy forest")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_winter.png" width=200> 
-
-The new image processing tool is based on ControlNet which can make very strong modifications to the image.
-By default the image processing tool returns an image of size 512x512 pixels. Let's see if we can upscale it.
-
-```py
-image = agent.run("Upscale the image", image)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_upscaler` to upscale the image.
-
-
-==Code generated by the agent==
-upscaled_image = image_upscaler(image)
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_winter_upscale.png" width=400> 
-
-The agent automatically mapped our prompt "Upscale the image" to the just added upscaler tool purely based on the description and name of the upscaler tool 
-and was able to correctly run it.
-
-Next, let's have a look at how you can create a new custom tool.
-
-### Adding new tools
-
-In this section, we show how to create a new tool that can be added to the agent.
-
-#### Creating a new tool
-
-We'll first start by creating a tool. We'll add the not-so-useful yet fun task of fetching the model on the Hugging Face
-Hub with the most downloads for a given task.
-
-We can do that with the following code:
-
-```python
-from huggingface_hub import list_models
-
-task = "text-classification"
-
-model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-print(model.id)
-```
-
-For the task `text-classification`, this returns `'facebook/bart-large-mnli'`, for `translation` it returns `'google-t5/t5-base`.
-
-How do we convert this to a tool that the agent can leverage? All tools depend on the superclass `Tool` that holds the
-main attributes necessary. We'll create a class that inherits from it:
-
-```python
-from transformers import Tool
-
-
-class HFModelDownloadsTool(Tool):
-    pass
-```
-
-This class has a few needs:
- An attribute `name`, which corresponds to the name of the tool itself. To be in tune with other tools which have a
-  performative name, we'll name it `model_download_counter`.
- An attribute `description`, which will be used to populate the prompt of the agent.
- `inputs` and `outputs` attributes. Defining this will help the python interpreter make educated choices about types,
-  and will allow for a gradio-demo to be spawned when we push our tool to the Hub. They're both a list of expected
-  values, which can be `text`, `image`, or `audio`.
- A `__call__` method which contains the inference code. This is the code we've played with above!
-
-Here's what our class looks like now:
-
-```python
-from transformers import Tool
-from huggingface_hub import list_models
-
-
-class HFModelDownloadsTool(Tool):
-    name = "model_download_counter"
-    description = (
-        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
-        "It takes the name of the category (such as text-classification, depth-estimation, etc), and "
-        "returns the name of the checkpoint."
-    )
-
-    inputs = ["text"]
-    outputs = ["text"]
-
-    def __call__(self, task: str):
-        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-        return model.id
-```
-
-We now have our tool handy. Save it in a file and import it from your main script. Let's name this file
-`model_downloads.py`, so the resulting import code looks like this:
-
-```python
-from model_downloads import HFModelDownloadsTool
-
-tool = HFModelDownloadsTool()
-```
-
-In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your 
-namespace. To do so, just call `push_to_hub` on the `tool` variable:
-
-```python
-tool.push_to_hub("hf-model-downloads")
-```
-
-You now have your code on the Hub! Let's take a look at the final step, which is to have the agent use it.
-
-#### Having the agent use the tool
-
-We now have our tool that lives on the Hub which can be instantiated as such (change the user name for your tool):
-
-```python
-from transformers import load_tool
-
-tool = load_tool("lysandre/hf-model-downloads")
-```
-
-In order to use it in the agent, simply pass it in the `additional_tools` parameter of the agent initialization method:
-
-```python
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
-
-agent.run(
-    "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
-)
-```
-which outputs the following:
-```text
-==Code generated by the agent==
-model = model_download_counter(task="text-to-video")
-print(f"The model with the most downloads is {model}.")
-audio_model = text_reader(model)
-
-
-==Result==
-The model with the most downloads is damo-vilab/text-to-video-ms-1.7b.
-```
-
-and generates the following audio.
-
-| **Audio**                                                                                                                                            |
-|------------------------------------------------------------------------------------------------------------------------------------------------------|
-| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
-
-
-<Tip>
-
-Depending on the LLM, some are quite brittle and require very exact prompts in order to work well. Having a well-defined
-name and description of the tool is paramount to having it be leveraged by the agent.
-
-</Tip>
-
-### Replacing existing tools
-
-Replacing existing tools can be done simply by assigning a new item to the agent's toolbox. Here's how one would do so:
-
-```python
-from transformers import HfAgent, load_tool
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-agent.toolbox["image-transformation"] = load_tool("diffusers/controlnet-canny-tool")
-```
-
-<Tip>
-
-Beware when replacing tools with others! This will also adjust the agent's prompt. This can be good if you have a better
-prompt suited for the task, but it can also result in your tool being selected way more than others or for other
-tools to be selected instead of the one you have defined.
-
-</Tip>
-
-## Leveraging gradio-tools
-
-[gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging
-Face Spaces as tools. It supports many existing Spaces as well as custom Spaces to be designed with it.
-
-We offer support for `gradio_tools` by using the `Tool.from_gradio` method. For example, we want to take
-advantage of the `StableDiffusionPromptGeneratorTool` tool offered in the `gradio-tools` toolkit so as to
-improve our prompts and generate better images.
-
-We first import the tool from `gradio_tools` and instantiate it:
-
-```python
-from gradio_tools import StableDiffusionPromptGeneratorTool
-
-gradio_tool = StableDiffusionPromptGeneratorTool()
-```
-
-We pass that instance to the `Tool.from_gradio` method:
-
-```python
-from transformers import Tool
-
-tool = Tool.from_gradio(gradio_tool)
-```
-
-Now we can manage it exactly as we would a usual custom tool. We leverage it to improve our prompt
-` a rabbit wearing a space suit`:
-
-```python
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
-
-agent.run("Generate an image of the `prompt` after improving it.", prompt="A rabbit wearing a space suit")
-```
-
-The model adequately leverages the tool:
-```text
-==Explanation from the agent==
-I will use the following  tools: `StableDiffusionPromptGenerator` to improve the prompt, then `image_generator` to generate an image according to the improved prompt.
-
-
-==Code generated by the agent==
-improved_prompt = StableDiffusionPromptGenerator(prompt)
-print(f"The improved prompt is {improved_prompt}.")
-image = image_generator(improved_prompt)
-```
-
-Before finally generating the image:
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png">
-
-<Tip warning={true}>
-
-gradio-tools requires *textual* inputs and outputs, even when working with different modalities. This implementation
-works with image and audio objects. The two are currently incompatible, but will rapidly become compatible as we
-work to improve the support.
-
-</Tip>
-
-## Future compatibility with Langchain
-
-We love Langchain and think it has a very compelling suite of tools. In order to handle these tools,
-Langchain requires *textual* inputs and outputs, even when working with different modalities.
-This is often the serialized version (i.e., saved to disk) of the objects.
-
-This difference means that multi-modality isn't handled between transformers-agents and langchain.
-We aim for this limitation to be resolved in future versions, and welcome any help from avid langchain
-users to help us achieve this compatibility.
-
-We would love to have better support. If you would like to help, please 
-[open an issue](https://github.com/huggingface/transformers/issues/new) and share what you have in mind.
--- a/docs/source/en/generation_strategies.md
+++ b/docs/source/en/generation_strategies.md
@ -21,7 +21,7 @@ more. It also plays a role in a variety of mixed-modality applications that have
 and vision-to-text. Some of the models that can generate text include
 GPT2, XLNet, OpenAI GPT, CTRL, TransformerXL, XLM, Bart, T5, GIT, Whisper.

-Check out a few examples that use [`~transformers.generation_utils.GenerationMixin.generate`] method to produce
+Check out a few examples that use [`~generation.GenerationMixin.generate`] method to produce
 text outputs for different tasks:
 * [Text summarization](./tasks/summarization#inference)
 * [Image captioning](./model_doc/git#transformers.GitForCausalLM.forward.example)
@ -173,6 +173,55 @@ your screen, one word at a time:
 An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
 ```

+
+## Watermarking
+
+The `generate()` supports watermarking the generated text by randomly marking a portion of tokens as "green". 
+When generating the "green" will have a small 'bias' value added to their logits, thus having a higher chance to be generated.
+The watermarked text can be detected by calculating the proportion of "green" tokens in the text and estimating how likely it is
+statistically to obtain that amount of "green" tokens for human-generated text. This watermarking strategy was proposed in the paper 
+["On the Reliability of Watermarks for Large Language Models"](https://arxiv.org/abs/2306.04634). For more information on 
+the inner functioning of watermarking, it is recommended to refer to the paper.
+
+The watermarking can be used with any generative model in `tranformers` and does not require an extra classification model
+to detect watermarked text. To trigger watermarking, pass in a [`WatermarkingConfig`] with needed arguments directly to the
+`.generate()` method or add it to the [`GenerationConfig`]. Watermarked text can be later detected with a [`WatermarkDetector`].
+
+
+<Tip warning={true}>
+
+The WatermarkDetector internally relies on the proportion of "green" tokens, and whether generated text follows the coloring pattern.
+That is why it is recommended to strip off the prompt text, if it is much longer than the generated text.
+This also can have an effect when one sequence in the batch is a lot longer causing other rows to be padded.
+Additionally, the detector **must** be initiated with identical watermark configuration arguments used when generating.
+
+</Tip>
+
+Let's generate some text with watermarking. In the below code snippet, we set the bias to 2.5 which is a value that
+will be added to "green" tokens' logits. After generating watermarked text, we can pass it directly to the `WatermarkDetector`
+to check if the text is machine-generated (outputs `True` for machine-generated and `False` otherwise).
+
+```python
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM, WatermarkDetector, WatermarkingConfig
+
+>>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+>>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
+>>> tok.pad_token_id = tok.eos_token_id
+>>> tok.padding_side = "left"
+
+>>> inputs = tok(["This is the beginning of a long story", "Alice and Bob are"], padding=True, return_tensors="pt")
+>>> input_len = inputs["input_ids"].shape[-1]
+
+>>> watermarking_config = WatermarkingConfig(bias=2.5, seeding_scheme="selfhash")
+>>> out = model.generate(**inputs, watermarking_config=watermarking_config, do_sample=False, max_length=20)
+
+>>> detector = WatermarkDetector(model_config=model.config, device="cpu", watermarking_config=watermarking_config)
+>>> detection_out = detector(out, return_dict=True)
+>>> detection_out.prediction
+array([True, True])
+```
+
+
 ## Decoding strategies

 Certain combinations of the `generate()` parameters, and ultimately `generation_config`, can be used to enable specific
--- a/docs/source/en/gguf.md
+++ b/docs/source/en/gguf.md
@ -0,0 +1,96 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# GGUF and interaction with Transformers
+
+The GGUF file format is used to store models for inference with [GGML](https://github.com/ggerganov/ggml) and other 
+libraries that depend on it, like the very popular [llama.cpp](https://github.com/ggerganov/llama.cpp) or 
+[whisper.cpp](https://github.com/ggerganov/whisper.cpp).
+
+It is a file format [supported by the Hugging Face Hub](https://huggingface.co/docs/hub/en/gguf) with features 
+allowing for quick inspection of tensors and metadata within the file.
+
+This file format is designed as a "single-file-format" where a single file usually contains both the configuration
+attributes, the tokenizer vocabulary and other attributes, as well as all tensors to be loaded in the model. These
+files come in different formats according to the quantization type of the file. We briefly go over some of them
+[here](https://huggingface.co/docs/hub/en/gguf#quantization-types).
+
+## Support within Transformers
+
+We have added the ability to load `gguf` files within `transformers` in order to offer further training/fine-tuning
+capabilities to gguf models, before converting back those models to `gguf` to use within the `ggml` ecosystem. When
+loading a model, we first dequantize it to fp32, before loading the weights to be used in PyTorch.
+
+> [!NOTE]
+> The support is still very exploratory and we welcome contributions in order to solidify it across quantization types
+> and model architectures.
+
+For now, here are the supported model architectures and quantization types:
+
+### Supported quantization types
+
+The initial supported quantization types are decided according to the popular quantized files that have been shared
+on the Hub.
+
+- F32
+- Q2_K
+- Q3_K
+- Q4_0
+- Q4_K
+- Q5_K
+- Q6_K
+- Q8_0
+
+We take example from the excellent [99991/pygguf](https://github.com/99991/pygguf) Python parser to dequantize the 
+weights.
+
+### Supported model architectures
+
+For now the supported model architectures are the architectures that have been very popular on the Hub, namely:
+
+- LLaMa
+- Mistral
+
+## Example usage
+
+In order to load `gguf` files in `transformers`, you should specify the `gguf_file` argument to the `from_pretrained`
+methods of both tokenizers and models. Here is how one would load a tokenizer and a model, which can be loaded
+from the exact same file:
+
+```py
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
+model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)
+```
+
+Now you have access to the full, unquantized version of the model in the PyTorch ecosystem, where you can combine it
+with a plethora of other tools.
+
+In order to convert back to a `gguf` file, we recommend using the 
+[`convert-hf-to-gguf.py` file](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) from llama.cpp.
+
+Here's how you would complete the script above to save the model and export it back to `gguf`:
+
+```py
+tokenizer.save_pretrained('directory')
+model.save_pretrained('directory')
+
+!python ${path_to_llama_cpp}/convert-hf-to-gguf.py ${directory}
+```
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@ -160,12 +160,13 @@ Flax), PyTorch, and/or TensorFlow.
 |                       [HerBERT](model_doc/herbert)                       |       ✅        |         ✅         |      ✅      |
 |                        [Hubert](model_doc/hubert)                        |       ✅        |         ✅         |      ❌      |
 |                        [I-BERT](model_doc/ibert)                         |       ✅        |         ❌         |      ❌      |
-|                       [IDEFICS](model_doc/idefics)                       |       ✅        |         ❌         |      ❌      |
+|                       [IDEFICS](model_doc/idefics)                       |       ✅        |         ✅         |      ❌      |
 |                      [Idefics2](model_doc/idefics2)                      |       ✅        |         ❌         |      ❌      |
 |                      [ImageGPT](model_doc/imagegpt)                      |       ✅        |         ❌         |      ❌      |
 |                      [Informer](model_doc/informer)                      |       ✅        |         ❌         |      ❌      |
 |                  [InstructBLIP](model_doc/instructblip)                  |       ✅        |         ❌         |      ❌      |
 |                         [Jamba](model_doc/jamba)                         |       ✅        |         ❌         |      ❌      |
+|                        [JetMoe](model_doc/jetmoe)                        |       ✅        |         ❌         |      ❌      |
 |                       [Jukebox](model_doc/jukebox)                       |       ✅        |         ❌         |      ❌      |
 |                      [KOSMOS-2](model_doc/kosmos-2)                      |       ✅        |         ❌         |      ❌      |
 |                      [LayoutLM](model_doc/layoutlm)                      |       ✅        |         ✅         |      ❌      |
@ -229,6 +230,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                           [OPT](model_doc/opt)                           |       ✅        |         ✅         |      ✅      |
 |                       [OWL-ViT](model_doc/owlvit)                        |       ✅        |         ❌         |      ❌      |
 |                         [OWLv2](model_doc/owlv2)                         |       ✅        |         ❌         |      ❌      |
+|                     [PaliGemma](model_doc/paligemma)                     |       ✅        |         ❌         |      ❌      |
 |                  [PatchTSMixer](model_doc/patchtsmixer)                  |       ✅        |         ❌         |      ❌      |
 |                      [PatchTST](model_doc/patchtst)                      |       ✅        |         ❌         |      ❌      |
 |                       [Pegasus](model_doc/pegasus)                       |       ✅        |         ✅         |      ✅      |
@ -302,6 +304,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                       [UnivNet](model_doc/univnet)                       |       ✅        |         ❌         |      ❌      |
 |                       [UPerNet](model_doc/upernet)                       |       ✅        |         ❌         |      ❌      |
 |                           [VAN](model_doc/van)                           |       ✅        |         ❌         |      ❌      |
+|                   [VideoLlava](model_doc/video_llava)                    |       ✅        |         ❌         |      ❌      |
 |                      [VideoMAE](model_doc/videomae)                      |       ✅        |         ❌         |      ❌      |
 |                          [ViLT](model_doc/vilt)                          |       ✅        |         ❌         |      ❌      |
 |                      [VipLlava](model_doc/vipllava)                      |       ✅        |         ❌         |      ❌      |
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@ -167,6 +167,9 @@ generation.
 [[autodoc]] MinNewTokensLengthLogitsProcessor
    - __call__

+[[autodoc]] MinPLogitsWarper
+    - __call__
+
 [[autodoc]] NoBadWordsLogitsProcessor
    - __call__

@ -206,6 +209,10 @@ generation.
 [[autodoc]] WhisperTimeStampLogitsProcessor
    - __call__

+[[autodoc]] WatermarkLogitsProcessor
+    - __call__
+
+
 ### TensorFlow

 [[autodoc]] TFForcedBOSTokenLogitsProcessor
@ -310,6 +317,12 @@ A [`StoppingCriteria`] can be used to change when to stop generation (other than
 [[autodoc]] MaxTimeCriteria
    - __call__

+[[autodoc]] StopStringCriteria
+    - __call__
+
+[[autodoc]] EosTokenCriteria
+    - __call__
+
 ## Constraints

 A [`Constraint`] can be used to force the generation to include specific tokens or sequences in the output. Please note that this is exclusively available to our PyTorch implementations.
@ -362,3 +375,11 @@ A [`Constraint`] can be used to force the generation to include specific tokens
 [[autodoc]] StaticCache
    - update
    - get_seq_length
+    - reorder_cache
+
+
+## Watermark Utils
+
+[[autodoc]] WatermarkDetector
+    - __call__
+
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@ -65,13 +65,12 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
 ['The theory of special relativity states 1. The speed of light is constant in all inertial reference']
 ```

+Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. However, if the batch size or the maximum output length increase between calls, the cache will have to be reinitialized, triggering a new compilation.
+
 </hfoption>
-<hfoption id="setup_cache">
+<hfoption id="Static Cache">

-> [!WARNING]
-> The `_setup_cache` method is an internal and private method that is still under development. This means it may not be backward compatible and the API design may change in the future.
-
-The `_setup_cache` method doesn't support [`~GenerationMixin.generate`] yet, so this method is a bit more involved. You'll need to write your own function to decode the next token given the current token and position and cache position of previously generated tokens.
+A [`StaticCache`] object can be passed to the model's forward pass under the `past_key_values` argument, enabling the use of this object as a static kv-cache. Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens. You can also pass the [`StaticCache`] object to [`~GenerationMixin.generate`] and use it across calls, like you would do with a dynamic cache.

 ```py
 from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging
@ -90,17 +89,22 @@ tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token
 model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="sequential")
 inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)

-def decode_one_tokens(model, cur_token, input_pos, cache_position):
+def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_values):
    logits = model(
-        cur_token, position_ids=input_pos, cache_position=cache_position, return_dict=False, use_cache=True
+        cur_token,
+        position_ids=input_pos,
+        cache_position=cache_position,
+        past_key_values=past_key_values,
+        return_dict=False,
+        use_cache=True
    )[0]
    new_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
    return new_token
 ```

-There are a few important things you must do to enable static kv-cache and torch.compile with the `_setup_cache` method:
+There are a few important things you must do to enable static kv-cache and torch.compile with the `StaticCache` method:

-1. Access the model's `_setup_cache` method and pass it the [`StaticCache`] class. This is a more flexible method because it allows you to configure parameters like the maximum batch size and sequence length.
+1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length.

 2. Call torch.compile on the model to compile the forward pass with the static kv-cache.

@ -109,24 +113,28 @@ There are a few important things you must do to enable static kv-cache and torch
 ```py
 batch_size, seq_length = inputs["input_ids"].shape
 with torch.no_grad():
-     model._setup_cache(StaticCache, 2, max_cache_len=4096)
-     cache_position = torch.arange(seq_length, device=torch_device)
-     generated_ids = torch.zeros(
-         batch_size, seq_length + NUM_TOKENS_TO_GENERATE + 1, dtype=torch.int, device=torch_device
-     )
-     generated_ids[:, cache_position] = inputs["input_ids"].to(torch_device).to(torch.int)
+    past_key_values = StaticCache(
+        config=model.config, max_batch_size=2, max_cache_len=4096, device=torch_device, dtype=model.dtype
+    )
+    cache_position = torch.arange(seq_length, device=torch_device)
+    generated_ids = torch.zeros(
+        batch_size, seq_length + NUM_TOKENS_TO_GENERATE + 1, dtype=torch.int, device=torch_device
+    )
+    generated_ids[:, cache_position] = inputs["input_ids"].to(torch_device).to(torch.int)

-     logits = model(**inputs, cache_position=cache_position, return_dict=False, use_cache=True)[0]
-     next_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
-     generated_ids[:, seq_length] = next_token[:, 0]
+    logits = model(
+        **inputs, cache_position=cache_position, past_key_values=past_key_values,return_dict=False, use_cache=True
+    )[0]
+    next_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
+    generated_ids[:, seq_length] = next_token[:, 0]

-     decode_one_tokens = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True)
-     cache_position = torch.tensor([seq_length + 1], device=torch_device)
-     for _ in range(1, NUM_TOKENS_TO_GENERATE):
-         with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
-             next_token = decode_one_tokens(model, next_token.clone(), None, cache_position)
-             generated_ids[:, cache_position] = next_token.int()
-         cache_position += 1
+    decode_one_tokens = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True)
+    cache_position = torch.tensor([seq_length + 1], device=torch_device)
+    for _ in range(1, NUM_TOKENS_TO_GENERATE):
+        with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
+            next_token = decode_one_tokens(model, next_token.clone(), None, cache_position, past_key_values)
+            generated_ids[:, cache_position] = next_token.int()
+        cache_position += 1

 text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 text
@ -134,6 +142,9 @@ text
 'My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p']
 ```

+> [!TIP]
+> If you want to reuse the [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method
+
 </hfoption>
 </hfoptions>

--- a/docs/source/en/llm_tutorial.md
+++ b/docs/source/en/llm_tutorial.md
@ -247,10 +247,11 @@ While the autoregressive generation process is relatively straightforward, makin

 ### Advanced generate usage

-1. [Guide](generation_strategies) on how to control different generation methods, how to set up the generation configuration file, and how to stream the output;
-2. [Guide](chat_templating) on the prompt template for chat LLMs;
-3. [Guide](tasks/prompting) on to get the most of prompt design;
-4. API reference on [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`], and [generate-related classes](internal/generation_utils). Most of the classes, including the logits processors, have usage examples!
+1. Guide on how to [control different generation methods](generation_strategies), how to set up the generation configuration file, and how to stream the output;
+2. [Accelerating text generation](llm_optims);
+3. [Prompt templates for chat LLMs](chat_templating);
+4. [Prompt design guide](tasks/prompting);
+5. API reference on [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`], and [generate-related classes](internal/generation_utils). Most of the classes, including the logits processors, have usage examples!

 ### LLM leaderboards

@ -259,10 +260,12 @@ While the autoregressive generation process is relatively straightforward, makin

 ### Latency, throughput and memory utilization

-1. [Guide](llm_tutorial_optimization) on how to optimize LLMs for speed and memory;
-2. [Guide](main_classes/quantization) on quantization such as bitsandbytes and autogptq, which shows you how to drastically reduce your memory requirements.
+1. Guide on how to [optimize LLMs for speed and memory](llm_tutorial_optimization);
+2. Guide on [quantization](main_classes/quantization) such as bitsandbytes and autogptq, which shows you how to drastically reduce your memory requirements.

 ### Related libraries

-1. [`text-generation-inference`](https://github.com/huggingface/text-generation-inference), a production-ready server for LLMs;
-2. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices.
+1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices.
+2. [`outlines`](https://github.com/outlines-dev/outlines), a library where you can constrain text generation (e.g. to generate JSON files);
+3. [`text-generation-inference`](https://github.com/huggingface/text-generation-inference), a production-ready server for LLMs;
+4. [`text-generation-webui`](https://github.com/oobabooga/text-generation-webui), a UI for text generation;
--- a/docs/source/en/main_classes/agent.md
+++ b/docs/source/en/main_classes/agent.md
@ -28,30 +28,27 @@ contains the API docs for the underlying classes.

 ## Agents

-We provide three types of agents: [`HfAgent`] uses inference endpoints for opensource models, [`LocalAgent`] uses a model of your choice locally and [`OpenAiAgent`] uses OpenAI closed models.
-
-### HfAgent
-
-[[autodoc]] HfAgent
-
-### LocalAgent
-
-[[autodoc]] LocalAgent
-
-### OpenAiAgent
-
-[[autodoc]] OpenAiAgent
-
-### AzureOpenAiAgent
-
-[[autodoc]] AzureOpenAiAgent
+We provide two types of agents, based on the main [`Agent`] class:
+- [`CodeAgent`] acts in one shot, generating code to solve the task, then executes it at once.
+- [`ReactAgent`] acts step by step, each step consisting of one thought, then one tool call and execution. It has two classes:
+  - [`ReactJsonAgent`] writes its tool calls in JSON.
+  - [`ReactCodeAgent`] writes its tool calls in Python code.

 ### Agent

 [[autodoc]] Agent
-    - chat
-    - run
-    - prepare_for_new_chat
+
+### CodeAgent
+
+[[autodoc]] CodeAgent
+
+### React agents
+
+[[autodoc]] ReactAgent
+
+[[autodoc]] ReactJsonAgent
+
+[[autodoc]] ReactCodeAgent

 ## Tools

@ -63,18 +60,50 @@ We provide three types of agents: [`HfAgent`] uses inference endpoints for opens

 [[autodoc]] Tool

+### Toolbox
+
+[[autodoc]] Toolbox
+
 ### PipelineTool

 [[autodoc]] PipelineTool

-### RemoteTool
-
-[[autodoc]] RemoteTool
-
 ### launch_gradio_demo

 [[autodoc]] launch_gradio_demo

+### ToolCollection
+
+[[autodoc]] ToolCollection
+
+## Engines
+
+You're free to create and use your own engines to be usable by the Agents framework.
+These engines have the following specification:
+1. Follow the [messages format](../chat_templating.md) for its input (`List[Dict[str, str]]`) and return a string.
+2. Stop generating outputs *before* the sequences passed in the argument `stop_sequences`
+
+### HfEngine
+
+For convenience, we have added a `HfEngine` that implements the points above and uses an inference endpoint for the execution of the LLM.
+
+```python
+>>> from transformers import HfEngine
+
+>>> messages = [
+...   {"role": "user", "content": "Hello, how are you?"},
+...   {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+...   {"role": "user", "content": "No need to help, take it easy."},
+... ]
+
+>>> HfEngine()(messages, stop_sequences=["conversation"])
+
+"That's very kind of you to say! It's always nice to have a relaxed "
+```
+
+[[autodoc]] HfEngine
+
+
 ## Agent Types

 Agents can handle any type of object in-between tools; tools, being completely multimodal, can accept and return
@ -94,12 +123,12 @@ These types have three specific purposes:

 ### AgentText

-[[autodoc]] transformers.tools.agent_types.AgentText
+[[autodoc]] transformers.agents.agent_types.AgentText

 ### AgentImage

-[[autodoc]] transformers.tools.agent_types.AgentImage
+[[autodoc]] transformers.agents.agent_types.AgentImage

 ### AgentAudio

-[[autodoc]] transformers.tools.agent_types.AgentAudio
+[[autodoc]] transformers.agents.agent_types.AgentAudio
--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@ -52,3 +52,7 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
 ## HfQuantizer

 [[autodoc]] quantizers.base.HfQuantizer
+
+## HqqConfig
+
+[[autodoc]] HqqConfig
--- a/docs/source/en/main_classes/text_generation.md
+++ b/docs/source/en/main_classes/text_generation.md
@ -41,6 +41,8 @@ like token streaming.
 	- validate
 	- get_generation_mode

+[[autodoc]] generation.WatermarkingConfig
+
 ## GenerationMixin

 [[autodoc]] generation.GenerationMixin
--- a/docs/source/en/model_doc/audio-spectrogram-transformer.md
+++ b/docs/source/en/model_doc/audio-spectrogram-transformer.md
@ -43,6 +43,34 @@ the authors compute the stats for a downstream dataset.
 - Note that the AST needs a low learning rate (the authors use a 10 times smaller learning rate compared to their CNN model proposed in the
 [PSLA paper](https://arxiv.org/abs/2102.01243)) and converges quickly, so please search for a suitable learning rate and learning rate scheduler for your task.

+### Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```
+from transformers import ASTForAudioClassification
+model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", attn_implementation="sdpa", torch_dtype=torch.float16)
+...
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `MIT/ast-finetuned-audioset-10-10-0.4593` model, we saw the following speedups during inference.
+
+|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
+|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
+|            1 |                                        27 |                                         6 |                      4.5 |
+|            2 |                                        12 |                                         6 |                      2   |
+|            4 |                                        21 |                                         8 |                      2.62 |
+|            8 |                                        40 |                                        14 |                      2.86 |
+
 ## Resources

 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with the Audio Spectrogram Transformer.
--- a/docs/source/en/model_doc/code_llama.md
+++ b/docs/source/en/model_doc/code_llama.md
@ -24,7 +24,7 @@ The abstract from the paper is the following:

 *We release Code Llama, a family of large language models for code based on Llama 2 providing state-of-the-art performance among open models, infilling capabilities, support for large input contexts, and zero-shot instruction following ability for programming tasks. We provide multiple flavors to cover a wide range of applications: foundation models (Code Llama), Python specializations (Code Llama - Python), and instruction-following models (Code Llama - Instruct) with 7B, 13B and 34B parameters each. All models are trained on sequences of 16k tokens and show improvements on inputs with up to 100k tokens. 7B and 13B Code Llama and Code Llama - Instruct variants support infilling based on surrounding content. Code Llama reaches state-of-the-art performance among open models on several code benchmarks, with scores of up to 53% and 55% on HumanEval and MBPP, respectively. Notably, Code Llama - Python 7B outperforms Llama 2 70B on HumanEval and MBPP, and all our models outperform every other publicly available model on MultiPL-E. We release Code Llama under a permissive license that allows for both research and commercial use.*

-Check out all Code Llama model checkpoints [here](https://huggingface.co/models?search=code_llama) and the officially released ones in the [codellama org](https://huggingface.co/codellama).
+Check out all Code Llama model checkpoints [here](https://huggingface.co/models?search=code_llama) and the officially released ones in the [Meta Llama org](https://huggingface.co/meta-llama).

 This model was contributed by [ArthurZucker](https://huggingface.co/ArthurZ). The original code of the authors can be found [here](https://github.com/facebookresearch/llama).

@ -62,8 +62,8 @@ After conversion, the model and tokenizer can be loaded via:
 ```python
 >>> from transformers import LlamaForCausalLM, CodeLlamaTokenizer

->>> tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
->>> model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf")
+>>> tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
+>>> model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
 >>> PROMPT = '''def remove_non_ascii(s: str) -> str:
 ...     """ <FILL_ME>
 ...     return result
@ -95,7 +95,7 @@ If you only want the infilled part:
 >>> from transformers import pipeline
 >>> import torch

->>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
+>>> generator = pipeline("text-generation",model="meta-llama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
 >>> generator('def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return result', max_new_tokens = 128)
 [{'generated_text': 'def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return resultRemove non-ASCII characters from a string. """\n    result = ""\n    for c in s:\n        if ord(c) < 128:\n            result += c'}]
 ```
--- a/docs/source/en/model_doc/conditional_detr.md
+++ b/docs/source/en/model_doc/conditional_detr.md
@ -33,7 +33,8 @@ This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The o

 ## Resources

- [Object detection task guide](../tasks/object_detection)
+- Scripts for finetuning [`ConditionalDetrForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
+- See also: [Object detection task guide](../tasks/object_detection).

 ## ConditionalDetrConfig

--- a/docs/source/en/model_doc/deformable_detr.md
+++ b/docs/source/en/model_doc/deformable_detr.md
@ -43,6 +43,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 <PipelineTag pipeline="object-detection"/>

 - Demo notebooks regarding inference + fine-tuning on a custom dataset for [`DeformableDetrForObjectDetection`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Deformable-DETR).
+- Scripts for finetuning [`DeformableDetrForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
 - See also: [Object detection task guide](../tasks/object_detection).

 If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
--- a/docs/source/en/model_doc/deit.md
+++ b/docs/source/en/model_doc/deit.md
@ -68,6 +68,34 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The Tenso
  *facebook/deit-base-patch16-384*. Note that one should use [`DeiTImageProcessor`] in order to
  prepare images for the model.

+### Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```
+from transformers import DeiTForImageClassification
+model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16)
+...
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `facebook/deit-base-distilled-patch16-224` model, we saw the following speedups during inference.
+
+|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
+|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
+|            1 |                                         8 |                                         6 |                      1.33 |
+|            2 |                                         9 |                                         6 |                      1.5  |
+|            4 |                                         9 |                                         6 |                      1.5  |
+|            8 |                                         8 |                                         6 |                      1.33 |
+
 ## Resources

 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DeiT.
--- a/docs/source/en/model_doc/deta.md
+++ b/docs/source/en/model_doc/deta.md
@ -39,7 +39,8 @@ The original code can be found [here](https://github.com/jozhang97/DETA).
 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DETA.

 - Demo notebooks for DETA can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETA).
- See also: [Object detection task guide](../tasks/object_detection)
+- Scripts for finetuning [`DetaForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
+- See also: [Object detection task guide](../tasks/object_detection).

 If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.

--- a/docs/source/en/model_doc/detr.md
+++ b/docs/source/en/model_doc/detr.md
@ -162,8 +162,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h

 <PipelineTag pipeline="object-detection"/>

- All example notebooks illustrating fine-tuning [`DetrForObjectDetection`] and [`DetrForSegmentation`] on a custom dataset an be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR).
- See also: [Object detection task guide](../tasks/object_detection)
+- All example notebooks illustrating fine-tuning [`DetrForObjectDetection`] and [`DetrForSegmentation`] on a custom dataset can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETR).
+- Scripts for finetuning [`DetrForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
+- See also: [Object detection task guide](../tasks/object_detection).

 If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.

--- a/docs/source/en/model_doc/idefics.md
+++ b/docs/source/en/model_doc/idefics.md
@ -52,6 +52,16 @@ To train a new IDEFICS model from scratch use the m4 codebase (a link will be pr
 [[autodoc]] IdeficsForVisionText2Text
    - forward

+## TFIdeficsModel
+
+[[autodoc]] TFIdeficsModel
+    - call
+
+## TFIdeficsForVisionText2Text
+
+[[autodoc]] TFIdeficsForVisionText2Text
+    - call
+
 ## IdeficsImageProcessor

 [[autodoc]] IdeficsImageProcessor
--- a/docs/source/en/model_doc/idefics2.md
+++ b/docs/source/en/model_doc/idefics2.md
@ -18,8 +18,7 @@ rendered properly in your Markdown viewer.

 ## Overview

-The Idefics2 model was created by the [Hugging Face M4](https://huggingface.co/HuggingFaceM4) team and authored by Léo Tronchon, Hugo Laurencon, Victor Sanh.
-The accompanying blog post can be found [here](https://huggingface.co/blog/idefics2).
+The Idefics2 model was proposed in [What matters when building vision-language models?](https://arxiv.org/abs/2405.02246) by Léo Tronchon, Hugo Laurencon, Victor Sanh. The accompanying blog post can be found [here](https://huggingface.co/blog/idefics2).

 Idefics2 is an open multimodal model that accepts arbitrary sequences of image and text inputs and produces text
 outputs. The model can answer questions about images, describe visual content, create stories grounded on multiple
@ -27,17 +26,34 @@ images, or simply behave as a pure language model without visual inputs. It impr
 document understanding, OCR, or visual reasoning. Idefics2 is lightweight (8 billion parameters) and treats
 images in their native aspect ratio and resolution, which allows for varying inference efficiency.

-Tips:
+The abstract from the paper is the following:
+
+*The growing interest in vision-language models (VLMs) has been driven by improvements in large language models and vision transformers. Despite the abundance of literature on this subject, we observe that critical decisions regarding the design of VLMs are often not justified. We argue that these unsupported decisions impede progress in the field by making it difficult to identify which choices improve model performance. To address this issue, we conduct extensive experiments around pre-trained models, architecture choice, data, and training methods. Our consolidation of findings includes the development of Idefics2, an efficient foundational VLM of 8 billion parameters. Idefics2 achieves state-of-the-art performance within its size category across various multimodal benchmarks, and is often on par with models four times its size. We release the model (base, instructed, and chat) along with the datasets created for its training.*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/idefics2_architecture.png"
+alt="drawing" width="600"/>
+
+<small> Idefics2 architecture. Taken from the <a href="https://arxiv.org/abs/2405.02246">original paper.</a> </small>
+
+This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts).
+The original code can be found [here](https://huggingface.co/HuggingFaceM4/idefics2).
+
+## Usage tips
+
 - Each sample can contain multiple images, and the number of images can vary between samples. The processor will pad the inputs to the maximum number of images in a batch for input to the model.
 - The processor has a `do_image_splitting` option. If `True`, each input image will be split into 4 sub-images, and concatenated with the original to form 5 images. This is useful for increasing model performance. Make sure `processor.image_processor.do_image_splitting` is set to `False` if the model was not trained with this option.
 - `text` passed to the processor should have the `<image>` tokens where the images should be inserted. And `<end_of_utterance>` at the end of each utterance if the text is a chat message.
 - The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as `text` to the processor.

 Example of how to use the processor on chat messages:
+
 ```python
 import requests
 from PIL import Image
 from transformers import Idefics2Processor, Idefics2ForConditionalGeneration
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"

 url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
 url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg"
@ -57,19 +73,71 @@ messages = [{

 processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b")
 model = Idefics2ForConditionalGeneration.from_pretrained("HuggingFaceM4/idefics2-8b")
+model.to(device)

-text = processor.apply_chat_template(messages)
-# "User: What’s the difference between these two images?<image><image><end_of_utterance>\n"
+# at inference time, one needs to pass `add_generation_prompt=True` in order to make sure the model completes the prompt
+text = processor.apply_chat_template(messages, add_generation_prompt=True)
 print(text)
+# 'User: What’s the difference between these two images?<image><image><end_of_utterance>\nAssistant:'

-inputs = processor(images=images, text=text)
+inputs = processor(images=images, text=text, return_tensors="pt").to(device)

-generated_text = model.generate(**inputs)
+generated_text = model.generate(**inputs, max_new_tokens=500)
+generated_text = processor.batch_decode(generated_text, skip_special_tokens=True)[0]
+print("Generated text:", generated_text)
 ```

-This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts).
-The original code can be found [here](https://huggingface.co/HuggingFaceM4/idefics2).
+## Model optimizations: Flash Attention

+The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
+
+First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of the [flash attention repository](https://github.com/Dao-AILab/flash-attention). Make also sure to load your model in half-precision (e.g. `torch.float16`)
+
+To load and run a model using Flash Attention-2, simply change the code snippet above with the following change:
+
+```diff
+model = Idefics2ForConditionalGeneration.from_pretrained(
+    "HuggingFaceM4/idefics2-8b",
+    torch_dtype=torch.float16,    
+    attn_implementation="flash_attention_2",
+).to(device)
+```
+
+## Shrinking down Idefics2 using quantization
+
+As the Idefics2 model has 8 billion parameters, that would require about 16GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), that requires only about 3.5GB of RAM.
+
+Quantizing a model is as simple as passing a `quantization_config` to the model. One can change the code snippet above with the changes below. We'll leverage the BitsAndyBytes quantization (but refer to [this page](../quantization.md) for other quantization methods):
+
+```diff
+ from transformers import BitsAndBytesConfig
+
+ quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.float16
+ )
+model = Idefics2ForConditionalGeneration.from_pretrained(
+    "HuggingFaceM4/idefics2-8b",
+    torch_dtype=torch.float16,    
+    quantization_config=quantization_config,
+).to(device)
+```
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Idefics2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+- A notebook on how to fine-tune Idefics2 on a custom dataset using the [Trainer](../main_classes/trainer.md) can be found [here](https://colab.research.google.com/drive/1NtcTgRbSBKN7pYD3Vdx1j9m8pt3fhFDB?usp=sharing). It supports both full fine-tuning as well as (quantized) LoRa.
+- A script regarding how to fine-tune Idefics2 using the TRL library can be found [here](https://gist.github.com/edbeeching/228652fc6c2b29a1641be5a5778223cb).
+- Demo notebook regarding fine-tuning Idefics2 for JSON extraction use cases can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Idefics2). 🌎

 ## Idefics2Config

@ -95,4 +163,4 @@ The original code can be found [here](https://huggingface.co/HuggingFaceM4/idefi

 ## Idefics2Processor
 [[autodoc]] Idefics2Processor
-    - __call__
+    - __call__
--- a/docs/source/en/model_doc/jetmoe.md
+++ b/docs/source/en/model_doc/jetmoe.md
@ -0,0 +1,49 @@
+<!--Copyright 2024 JetMoe team and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# JetMoe
+
+## Overview
+
+**JetMoe-8B** is an 8B Mixture-of-Experts (MoE) language model developed by [Yikang Shen](https://scholar.google.com.hk/citations?user=qff5rRYAAAAJ) and [MyShell](https://myshell.ai/).
+JetMoe project aims to provide a LLaMA2-level performance and efficient language model with a limited budget.
+To achieve this goal, JetMoe uses a sparsely activated architecture inspired by the [ModuleFormer](https://arxiv.org/abs/2306.04640). 
+Each JetMoe block consists of two MoE layers: Mixture of Attention Heads and Mixture of MLP Experts.
+Given the input tokens, it activates a subset of its experts to process them.
+This sparse activation schema enables JetMoe to achieve much better training throughput than similar size dense models. 
+The training throughput of JetMoe-8B is around 100B tokens per day on a cluster of 96 H100 GPUs with a straightforward 3-way pipeline parallelism strategy.
+
+This model was contributed by [Yikang Shen](https://huggingface.co/YikangS).
+
+
+## JetMoeConfig
+
+[[autodoc]] JetMoeConfig
+
+## JetMoeModel
+
+[[autodoc]] JetMoeModel
+    - forward
+
+## JetMoeForCausalLM
+
+[[autodoc]] JetMoeForCausalLM
+    - forward
+
+## JetMoeForSequenceClassification
+
+[[autodoc]] JetMoeForSequenceClassification
+    - forward
--- a/docs/source/en/model_doc/llama3.md
+++ b/docs/source/en/model_doc/llama3.md
@ -82,4 +82,4 @@ pipeline("Hey how are you doing today?")
 ```

 ## Resources
-A ton of cool resources are already available on the documentation page of [~llama2], inviting contributors to add new recourses curated for Llama3 here! 🤗
+A ton of cool resources are already available on the documentation page of [~llama2], inviting contributors to add new resources curated for Llama3 here! 🤗
--- a/docs/source/en/model_doc/owlv2.md
+++ b/docs/source/en/model_doc/owlv2.md
@ -64,8 +64,8 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio
 >>> for box, score, label in zip(boxes, scores, labels):
 ...     box = [round(i, 2) for i in box.tolist()]
 ...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
-Detected a photo of a cat with confidence 0.614 at location [341.67, 17.54, 642.32, 278.51]
-Detected a photo of a cat with confidence 0.665 at location [6.75, 38.97, 326.62, 354.85]
+Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
+Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
 ```

 ## Resources
--- a/docs/source/en/model_doc/paligemma.md
+++ b/docs/source/en/model_doc/paligemma.md
@ -0,0 +1,38 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# PaliGemma
+
+## Overview
+
+The PaliGemma model was proposed by Google. It is a 3B VLM composed by a Siglip-400m vision encoder and a Gemma-2B decoder linked by a multimodal linear projection. It is not a chat model with images. It cuts an image into a fixed number of VIT tokens and prepends it to an optional prompt. One particularity is that the model uses full block attention on all the image tokens plus the input text tokens. It comes in 3 resolutions, 224x224, 448x448 and 896x896 with 3 base models, with 55 fine-tuned versions for different tasks, and 2 mix models.
+
+
+This model was contributed by [Molbap](https://huggingface.co/Molbap).
+
+
+## PaliGemmaConfig
+
+[[autodoc]] PaliGemmaConfig
+
+## PaliGemmaProcessor
+
+[[autodoc]] PaliGemmaProcessor
+
+## PaliGemmaForConditionalGeneration
+
+[[autodoc]] PaliGemmaForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/phi3.md
+++ b/docs/source/en/model_doc/phi3.md
@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.

-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

 -->
@ -51,13 +51,16 @@ Phi-3 has been integrated in the development version (4.40.0.dev) of `transforme
 >>> model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
 >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

->>> messages = [{"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."},{"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}]
+>>> messages = [{"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}]
 >>> inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

 >>> outputs = model.generate(inputs, max_new_tokens=32)
 >>> text = tokenizer.batch_decode(outputs)[0]
 >>> print(text)
-<s><|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Absolutely! Bananas and dragonfruits are both delicious fruits that can be combined in various ways to create tasty and nutrit
+<s><|user|> 
+Can you provide ways to eat combinations of bananas and dragonfruits?<|end|> 
+<|assistant|> 
+Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some ideas for eating combinations of bananas and
 ```

 ## Phi3Config
--- a/docs/source/en/model_doc/video_llava.md
+++ b/docs/source/en/model_doc/video_llava.md
@ -0,0 +1,129 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Video-LLaVA
+
+## Overview
+
+Video-LLaVa is an open-source multimodal LLM trained by fine-tuning LlamA/Vicuna on multimodal instruction-following data generated by Llava1.5 and VideChat. It is an auto-regressive language model, based on the transformer architecture. Video-LLaVa unifies visual representations to the language feature space, and enables an LLM to perform visual reasoning capabilities on both images and videos simultaneously.
+
+
+The Video-LLaVA model was proposed in [Video-LLaVA: Learning United Visual Representation by Alignment Before Projection](https://arxiv.org/abs/2311.10122) by Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munang Ning, Peng Jin, Li Yuan.
+
+The abstract from the paper is the following:
+
+*The Large Vision-Language Model (LVLM) has enhanced the performance of various downstream tasks in
+visual-language understanding. Most existing approaches
+encode images and videos into separate feature spaces,
+which are then fed as inputs to large language models.
+However, due to the lack of unified tokenization for images and videos, namely misalignment before projection, it
+becomes challenging for a Large Language Model (LLM)
+to learn multi-modal interactions from several poor projection layers. In this work, we unify visual representation into the language feature space to advance the foundational LLM towards a unified LVLM. As a result, we establish a simple but robust LVLM baseline, Video-LLaVA,
+which learns from a mixed dataset of images and videos,
+mutually enhancing each other. Video-LLaVA achieves superior performances on a broad range of 9 image benchmarks across 5 image question-answering datasets and 4
+image benchmark toolkits. Additionally, our Video-LLaVA
+also outperforms Video-ChatGPT by 5.8%, 9.9%, 18.6%,
+and 10.1% on MSRVTT, MSVD, TGIF, and ActivityNet, respectively. Notably, extensive experiments demonstrate that
+Video-LLaVA mutually benefits images and videos within
+a unified visual representation, outperforming models designed specifically for images or videos. We aim for this
+work to provide modest insights into the multi-modal inputs
+for the LLM*
+
+Tips:
+
+- We advise users to use padding_side="left" when computing batched generation as it leads to more accurate results. Simply make sure to call processor.tokenizer.padding_side = "left" before generating.
+
+- Note the model has not been explicitly trained to process multiple images/videos in the same prompt, although this is technically possible, you may experience inaccurate results.
+
+- For better results, we recommend users prompt the model with the correct prompt format:
+
+
+```python
+import av
+import torch
+import numpy as np
+import requests
+from PIL import Image
+from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor
+
+def read_video_pyav(container, indices):
+    '''
+    Decode the video with PyAV decoder.
+    Args:
+        container (`av.container.input.InputContainer`): PyAV container.
+        indices (`List[int]`): List of frame indices to decode.
+    Returns:
+        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+    '''
+    frames = []
+    container.seek(0)
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:
+            break
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
+model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", device_map="auto")
+processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
+
+video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
+
+container = av.open(video_path)
+total_frames = container.streams.video[0].frames
+indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+video = read_video_pyav(container, indices)
+
+prompt = "USER: <video>Why is this funny? ASSISTANT:"
+inputs = processor(text=prompt, videos=video, return_tensors="pt")
+
+out = model.generate(**inputs, max_new_tokens=40)
+print(processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True))
+```
+
+For multiple turns conversation change the prompt to:
+
+```bash
+"USER: <video>What do you see in this video? ASSISTANT: A baby reading a book. USER: Why is the it funny? ASSISTANT:"
+```
+
+- Note that the video inputs should have exactly 8 frames at the input, since the models were trained in that setting.
+
+
+
+This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay).
+The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA).
+
+
+## VideoLlavaConfig
+
+[[autodoc]] VideoLlavaConfig
+
+## VideoLlavaImageProcessor
+
+[[autodoc]] VideoLlavaImageProcessor
+
+## VideoLlavaProcessor
+
+[[autodoc]] VideoLlavaProcessor
+
+## VideoLlavaForConditionalGeneration
+
+[[autodoc]] VideoLlavaForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/videomae.md
+++ b/docs/source/en/model_doc/videomae.md
@ -33,6 +33,34 @@ alt="drawing" width="600"/>
 This model was contributed by [nielsr](https://huggingface.co/nielsr).
 The original code can be found [here](https://github.com/MCG-NJU/VideoMAE).

+## Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```
+from transformers import VideoMAEForVideoClassification
+model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics", attn_implementation="sdpa", torch_dtype=torch.float16)
+...
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `MCG-NJU/videomae-base-finetuned-kinetics` model, we saw the following speedups during inference.
+
+|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
+|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
+|            1 |                                        37 |                                        10 |                      3.7  |
+|            2 |                                        24 |                                        18 |                      1.33 |
+|            4 |                                        43 |                                        32 |                      1.34 |
+|            8 |                                        84 |                                        60 |                      1.4  |
+
 ## Resources

 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VideoMAE. If
--- a/docs/source/en/model_doc/vit.md
+++ b/docs/source/en/model_doc/vit.md
@ -88,6 +88,34 @@ who already converted the weights from JAX to PyTorch. Credits go to him!
  language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
  improvement of 2% to training from scratch, but still 4% behind supervised pre-training.

+### Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```
+from transformers import ViTForImageClassification
+model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16)
+...
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-base-patch16-224` model, we saw the following speedups during inference.
+
+|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
+|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
+|            1 |                                         7 |                                         6 |                      1.17 |
+|            2 |                                         8 |                                         6 |                      1.33 |
+|            4 |                                         8 |                                         6 |                      1.33 |
+|            8 |                                         8 |                                         6 |                      1.33 |
+
 ## Resources

 Demo notebooks regarding inference as well as fine-tuning ViT on custom data can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer).
--- a/docs/source/en/model_doc/vit_hybrid.md
+++ b/docs/source/en/model_doc/vit_hybrid.md
@ -39,6 +39,34 @@ substantially fewer computational resources to train.*
 This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be
 found [here](https://github.com/google-research/vision_transformer).

+## Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```
+from transformers import ViTHybridForImageClassification
+model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", attn_implementation="sdpa", torch_dtype=torch.float16)
+...
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-hybrid-base-bit-384` model, we saw the following speedups during inference.
+
+|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
+|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
+|            1 |                                        29 |                                        18 |                      1.61 |
+|            2 |                                        26 |                                        18 |                      1.44 |
+|            4 |                                        25 |                                        18 |                      1.39 |
+|            8 |                                        34 |                                        24 |                      1.42 |
+
 ## Resources

 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid.
--- a/docs/source/en/model_doc/vit_mae.md
+++ b/docs/source/en/model_doc/vit_mae.md
@ -52,6 +52,34 @@ consists of Transformer blocks) takes as input. Each mask token is a shared, lea
 sin/cos position embeddings are added both to the input of the encoder and the decoder.
 - For a visual understanding of how MAEs work you can check out this [post](https://keras.io/examples/vision/masked_image_modeling/).

+### Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```
+from transformers import ViTMAEModel
+model = ViTMAEModel.from_pretrained("facebook/vit-mae-base", attn_implementation="sdpa", torch_dtype=torch.float16)
+...
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `facebook/vit-mae-base` model, we saw the following speedups during inference.
+
+|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
+|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
+|            1 |                                        11 |                                         6 |                      1.83 |
+|            2 |                                         8 |                                         6 |                      1.33 |
+|            4 |                                         8 |                                         6 |                      1.33 |
+|            8 |                                         8 |                                         6 |                      1.33 |
+
 ## Resources

 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTMAE.
--- a/docs/source/en/model_doc/vit_msn.md
+++ b/docs/source/en/model_doc/vit_msn.md
@ -49,6 +49,34 @@ use the [`ViTMSNForImageClassification`] class which is initialized from [`ViTMS
 - MSN is particularly useful in the low-shot and extreme low-shot regimes. Notably, it achieves 75.7% top-1 accuracy with only 1% of ImageNet-1K
 labels when fine-tuned.

+### Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```
+from transformers import ViTMSNForImageClassification
+model = ViTMSNForImageClassification.from_pretrained("facebook/vit-msn-base", attn_implementation="sdpa", torch_dtype=torch.float16)
+...
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `facebook/vit-msn-base` model, we saw the following speedups during inference.
+
+|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
+|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
+|            1 |                                         7 |                                         6 |                      1.17 |
+|            2 |                                         8 |                                         6 |                      1.33 |
+|            4 |                                         8 |                                         6 |                      1.33 |
+|            8 |                                         8 |                                         6 |                      1.33 |
+
 ## Resources

 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT MSN.
--- a/docs/source/en/model_doc/yolos.md
+++ b/docs/source/en/model_doc/yolos.md
@ -32,6 +32,34 @@ alt="drawing" width="600"/>

 This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/hustvl/YOLOS).

+## Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```
+from transformers import AutoModelForObjectDetection
+model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-base", attn_implementation="sdpa", torch_dtype=torch.float16)
+...
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `hustvl/yolos-base` model, we saw the following speedups during inference.
+
+|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
+|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
+|            1 |                                       106 |                                        76 |                      1.39 |
+|            2 |                                       154 |                                        90 |                      1.71 |
+|            4 |                                       222 |                                       116 |                      1.91 |
+|            8 |                                       368 |                                       168 |                      2.19 |
+
 ## Resources

 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with YOLOS.
@ -39,6 +67,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 <PipelineTag pipeline="object-detection"/>

 - All example notebooks illustrating inference + fine-tuning [`YolosForObjectDetection`] on a custom dataset can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS).
+- Scripts for finetuning [`YolosForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
 - See also: [Object detection task guide](../tasks/object_detection)

 If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@ -50,11 +50,13 @@ FlashAttention-2 is currently supported for the following architectures:
 * [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj#transformers.GPTJModel)
 * [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model)
 * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
+* [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
 * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
 * [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
 * [Llava](https://huggingface.co/docs/transformers/model_doc/llava)
 * [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)
 * [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)
+* [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava)
 * [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)
 * [MBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel)
 * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
@ -94,7 +96,7 @@ We strongly suggest referring to the detailed [installation instructions](https:
 </hfoption>
 <hfoption id="AMD">

-FlashAttention-2 is also supported on AMD GPUs and current support is limited to **Instinct MI210** and **Instinct MI250**. We strongly suggest using this [Dockerfile](https://github.com/huggingface/optimum-amd/tree/main/docker/transformers-pytorch-amd-gpu-flash/Dockerfile) to use FlashAttention-2 on AMD GPUs.
+FlashAttention-2 is also supported on AMD GPUs and current support is limited to **Instinct MI210**, **Instinct MI250** and **Instinct MI300**. We strongly suggest using this [Dockerfile](https://github.com/huggingface/optimum-amd/tree/main/docker/transformers-pytorch-amd-gpu-flash/Dockerfile) to use FlashAttention-2 on AMD GPUs.

 </hfoption>
 </hfoptions>
@ -190,17 +192,21 @@ FlashAttention is more memory efficient, meaning you can train on much larger se
 PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers and is used by default for `torch>=2.1.1` when an implementation is available. You may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.

 For now, Transformers supports SDPA inference and training for the following architectures:
+* [Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer#transformers.ASTModel)
 * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
 * [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel)
 * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
 * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
+* [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel)
 * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader)
 * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
 * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel)
 * [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel)
+* [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
 * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
 * [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
 * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
+* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
 * [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel)
 * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel)
 * [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel)
@ -212,12 +218,18 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel)
 * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
 * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
+* [ViT](https://huggingface.co/docs/transformers/model_doc/vit#transformers.ViTModel)
+* [ViTHybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid#transformers.ViTHybridModel)
+* [ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae#transformers.ViTMAEModel)
+* [ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn#transformers.ViTMSNModel)
+* [VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae#transformers.VideoMAEModell)
 * [wav2vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2Model)
 * [Hubert](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.HubertModel)
 * [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel)
 * [Sew](https://huggingface.co/docs/transformers/main/en/model_doc/sew#transformers.SEWModel)
 * [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel)
 * [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel)
+* [YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos#transformers.YolosModel)


 <Tip>
--- a/docs/source/en/quantization.md
+++ b/docs/source/en/quantization.md
@ -642,6 +642,27 @@ double_quant_config = BitsAndBytesConfig(
 model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", quantization_config=double_quant_config)
 ```

+### Dequantizing `bitsandbytes` models
+
+Once quantized, you can dequantize the model to the original precision. Note this might result in a small quality loss of the model. Make also sure to have enough GPU RAM to fit the dequantized model. 
+Below is how to perform dequantization on a 4-bit model using `bitsandbytes`.
+
+```python
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
+
+model_id = "facebook/opt-125m"
+
+model = AutoModelForCausalLM.from_pretrained(model_id, BitsAndBytesConfig(load_in_4bit=True))
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+model.dequantize()
+
+text = tokenizer("Hello my name is", return_tensors="pt").to(0)
+
+out = model.generate(**text)
+print(tokenizer.decode(out[0]))
+```
+
 ## EETQ
 The [EETQ](https://github.com/NetEase-FuXi/EETQ) library supports int8 per-channel weight-only quantization for NVIDIA GPUS. The high-performance GEMM and GEMV kernels are from FasterTransformer and TensorRT-LLM. It requires no calibration dataset and does not need to pre-quantize your model. Moreover, the accuracy degradation is negligible owing to the per-channel quantization. 

@ -745,3 +766,53 @@ The speed and throughput of fused and unfused modules were also tested with the
    <figcaption class="mt-2 text-center text-sm text-gray-500">generate throughput/batch size</figcaption>
  </div>
 </div>
+
+## HQQ 
+Half-Quadratic Quantization (HQQ) implements on-the-fly quantization via fast robust optimization. It doesn't require calibration data and can be used to quantize any model.  
+Please refer to the <a href="https://github.com/mobiusml/hqq/">official package</a> for more details.
+
+For installation, we recommend you use the following approach to get the latest version and build its corresponding CUDA kernels:
+```
+pip install hqq
+```
+
+To quantize a model, you need to create an [`HqqConfig`]. There are two ways of doing it:
+``` Python
+from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
+
+# Method 1: all linear layers will use the same quantization config
+quant_config  = HqqConfig(nbits=8, group_size=64, quant_zero=False, quant_scale=False, axis=0) #axis=0 is used by default
+```
+
+``` Python
+# Method 2: each linear layer with the same tag will use a dedicated quantization config
+q4_config = {'nbits':4, 'group_size':64, 'quant_zero':False, 'quant_scale':False}
+q3_config = {'nbits':3, 'group_size':32, 'quant_zero':False, 'quant_scale':False}
+quant_config  = HqqConfig(dynamic_config={
+  'self_attn.q_proj':q4_config,
+  'self_attn.k_proj':q4_config,
+  'self_attn.v_proj':q4_config,
+  'self_attn.o_proj':q4_config,
+
+  'mlp.gate_proj':q3_config,
+  'mlp.up_proj'  :q3_config,
+  'mlp.down_proj':q3_config,
+})
+```
+
+The second approach is especially interesting for quantizing Mixture-of-Experts (MoEs) because the experts are less affected by lower quantization settings.
+
+
+Then you simply quantize the model as follows
+``` Python
+model = transformers.AutoModelForCausalLM.from_pretrained(
+    model_id, 
+    torch_dtype=torch.float16, 
+    device_map="cuda", 
+    quantization_config=quant_config
+)
+```
+### Optimized Runtime
+HQQ supports various backends, including pure Pytorch and custom dequantization CUDA kernels. These backends are suitable for older gpus and peft/QLoRA training.
+For faster inference, HQQ supports 4-bit fused kernels (TorchAO and Marlin), reaching up to 200 tokens/sec on a single 4090.
+For more details on how to use the backends, please refer to https://github.com/mobiusml/hqq/?tab=readme-ov-file#backend
--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
--- a/docs/source/en/tasks/image_classification.md
+++ b/docs/source/en/tasks/image_classification.md
@ -38,7 +38,7 @@ To see all architectures and checkpoints compatible with this task, we recommend
 Before you begin, make sure you have all the necessary libraries installed:

 ```bash
-pip install transformers datasets evaluate accelerate
+pip install transformers datasets evaluate accelerate pillow torchvision scikit-learn
 ```

 We encourage you to log in to your Hugging Face account to upload and share your model with the community. When prompted, enter your token to log in:
--- a/docs/source/en/tasks/language_modeling.md
+++ b/docs/source/en/tasks/language_modeling.md
@ -382,7 +382,7 @@ Tokenize the text and return the `input_ids` as PyTorch tensors:
 >>> inputs = tokenizer(prompt, return_tensors="pt").input_ids
 ```

-Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to generate text.
+Use the [`~generation.GenerationMixin.generate`] method to generate text.
 For more details about the different text generation strategies and parameters for controlling generation, check out the [Text generation strategies](../generation_strategies) page.

 ```py
--- a/docs/source/en/tasks/object_detection.md
+++ b/docs/source/en/tasks/object_detection.md
--- a/docs/source/en/tasks/semantic_segmentation.md
+++ b/docs/source/en/tasks/semantic_segmentation.md
@ -60,7 +60,7 @@ image
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/segmentation_input.jpg" alt="Segmentation Input"/>
 </div>

-We will use [nvidia/segformer-b1-finetuned-cityscapes-1024-1024](https://huggingface.co/nvidia/segformer-b1-finetuned-cityscapes-1024-1024). 
+We will use [nvidia/segformer-b1-finetuned-cityscapes-1024-1024](https://huggingface.co/nvidia/segformer-b1-finetuned-cityscapes-1024-1024).

 ```python
 semantic_segmentation = pipeline("image-segmentation", "nvidia/segformer-b1-finetuned-cityscapes-1024-1024")
@ -68,7 +68,7 @@ results = semantic_segmentation(image)
 results
 ```

-The segmentation pipeline output includes a mask for every predicted class. 
+The segmentation pipeline output includes a mask for every predicted class.
 ```bash
 [{'score': None,
  'label': 'road',
@ -111,11 +111,11 @@ results[-1]["mask"]
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/semantic_segmentation_output.png" alt="Semantic Segmentation Output"/>
 </div>

-In instance segmentation, the goal is not to classify every pixel, but to predict a mask for **every instance of an object** in a given image. It works very similar to object detection, where there is a bounding box for every instance, there's a segmentation mask instead. We will use [facebook/mask2former-swin-large-cityscapes-instance](https://huggingface.co/facebook/mask2former-swin-large-cityscapes-instance) for this. 
+In instance segmentation, the goal is not to classify every pixel, but to predict a mask for **every instance of an object** in a given image. It works very similar to object detection, where there is a bounding box for every instance, there's a segmentation mask instead. We will use [facebook/mask2former-swin-large-cityscapes-instance](https://huggingface.co/facebook/mask2former-swin-large-cityscapes-instance) for this.

 ```python
 instance_segmentation = pipeline("image-segmentation", "facebook/mask2former-swin-large-cityscapes-instance")
-results = instance_segmentation(Image.open(image))
+results = instance_segmentation(image)
 results
 ```

@ -148,7 +148,7 @@ Panoptic segmentation combines semantic segmentation and instance segmentation,

 ```python
 panoptic_segmentation = pipeline("image-segmentation", "facebook/mask2former-swin-large-cityscapes-panoptic")
-results = panoptic_segmentation(Image.open(image))
+results = panoptic_segmentation(image)
 results
 ```
 As you can see below, we have more classes. We will later illustrate to see that every pixel is classified into one of the classes.
--- a/docs/source/en/tasks/summarization.md
+++ b/docs/source/en/tasks/summarization.md
@ -355,7 +355,7 @@ Tokenize the text and return the `input_ids` as PyTorch tensors:
 >>> inputs = tokenizer(text, return_tensors="pt").input_ids
 ```

-Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API.
+Use the [`~generation.GenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API.

 ```py
 >>> from transformers import AutoModelForSeq2SeqLM
--- a/docs/source/en/tasks/translation.md
+++ b/docs/source/en/tasks/translation.md
@ -364,7 +364,7 @@ Tokenize the text and return the `input_ids` as PyTorch tensors:
 >>> inputs = tokenizer(text, return_tensors="pt").input_ids
 ```

-Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API.
+Use the [`~generation.GenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API.

 ```py
 >>> from transformers import AutoModelForSeq2SeqLM
--- a/docs/source/en/tasks/zero_shot_image_classification.md
+++ b/docs/source/en/tasks/zero_shot_image_classification.md
@ -39,7 +39,7 @@ In this guide you'll learn how to:
 Before you begin, make sure you have all the necessary libraries installed:

 ```bash
-pip install -q transformers
+pip install -q "transformers[torch]" pillow
 ```

 ## Zero-shot image classification pipeline
--- a/docs/source/en/training.md
+++ b/docs/source/en/training.md
@ -186,6 +186,7 @@ so we can just convert that directly to a NumPy array without tokenization!

 ```py
 from transformers import AutoTokenizer
+import numpy as np

 tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
 tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
--- a/docs/source/en/transformers_agents.md
+++ b/docs/source/en/transformers_agents.md
@ -1,323 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Transformers Agents
-
-<Tip warning={true}>
-
-Transformers Agents is an experimental API which is subject to change at any time. Results returned by the agents
-can vary as the APIs or underlying models are prone to change.
-
-</Tip>
-
-Transformers version v4.29.0, building on the concept of *tools* and *agents*. You can play with in
-[this colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj).
-
-In short, it provides a natural language API on top of transformers: we define a set of curated tools and design an 
-agent to interpret natural language and to use these tools. It is extensible by design; we curated some relevant tools, 
-but we'll show you how the system can be extended easily to use any tool developed by the community.
-
-Let's start with a few examples of what can be achieved with this new API. It is particularly powerful when it comes 
-to multimodal tasks, so let's take it for a spin to generate images and read text out loud.
-
-```py
-agent.run("Caption the following image", image=image)
-```
-
-| **Input**                                                                                                                   | **Output**                        |
-|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------|
-| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/beaver.png" width=200> | A beaver is swimming in the water |
-
---
-
-```py
-agent.run("Read the following text out loud", text=text)
-```
-| **Input**                                                                                                               | **Output**                                   |
-|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------|
-| A beaver is swimming in the water | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tts_example.wav" type="audio/wav"> your browser does not support the audio element. </audio>
-
---
-
-```py
-agent.run(
-    "In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?",
-    document=document,
-)
-```
-| **Input**                                                                                                                   | **Output**     |
-|-----------------------------------------------------------------------------------------------------------------------------|----------------|
-| <img src="https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/0/image/image.jpg" width=200> | ballroom foyer |
-
-## Quickstart
-
-Before being able to use `agent.run`, you will need to instantiate an agent, which is a large language model (LLM). 
-We provide support for openAI models as well as opensource alternatives from BigCode and OpenAssistant. The openAI
-models perform better (but require you to have an openAI API key, so cannot be used for free); Hugging Face is
-providing free access to endpoints for BigCode and OpenAssistant models.
-
-To start with, please install the `agents` extras in order to install all default dependencies.
-```bash
-pip install transformers[agents]
-```
-
-To use openAI models, you instantiate an [`OpenAiAgent`] after installing the `openai` dependency:
-
-```bash
-pip install openai
-```
-
-
-```py
-from transformers import OpenAiAgent
-
-agent = OpenAiAgent(model="text-davinci-003", api_key="<your_api_key>")
-```
-
-To use BigCode or OpenAssistant, start by logging in to have access to the Inference API:
-
-```py
-from huggingface_hub import login
-
-login("<YOUR_TOKEN>")
-```
-
-Then, instantiate the agent
-
-```py
-from transformers import HfAgent
-
-# Starcoder
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-# StarcoderBase
-# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase")
-# OpenAssistant
-# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5")
-```
-
-This is using the inference API that Hugging Face provides for free at the moment. If you have your own inference
-endpoint for this model (or another one) you can replace the URL above with your URL endpoint.
-
-<Tip>
-
-StarCoder and OpenAssistant are free to use and perform admirably well on simple tasks. However, the checkpoints
-don't hold up when handling more complex prompts. If you're facing such an issue, we recommend trying out the OpenAI
-model which, while sadly not open-source, performs better at this given time.
-
-</Tip>
-
-You're now good to go! Let's dive into the two APIs that you now have at your disposal.
-
-### Single execution (run)
-
-The single execution method is when using the [`~Agent.run`] method of the agent:
-
-```py
-agent.run("Draw me a picture of rivers and lakes.")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
-
-It automatically selects the tool (or tools) appropriate for the task you want to perform and runs them appropriately. It
-can perform one or several tasks in the same instruction (though the more complex your instruction, the more likely
-the agent is to fail).
-
-```py
-agent.run("Draw me a picture of the sea then transform the picture to add an island")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sea_and_island.png" width=200>
-
-<br/>
-
-
-Every [`~Agent.run`] operation is independent, so you can run it several times in a row with different tasks.
-
-Note that your `agent` is just a large-language model, so small variations in your prompt might yield completely
-different results. It's important to explain as clearly as possible the task you want to perform. We go more in-depth
-on how to write good prompts [here](custom_tools#writing-good-user-inputs).
-
-If you'd like to keep a state across executions or to pass non-text objects to the agent, you can do so by specifying
-variables that you would like the agent to use. For example, you could generate the first image of rivers and lakes, 
-and ask the model to update that picture to add an island by doing the following:
-
-```python
-picture = agent.run("Generate a picture of rivers and lakes.")
-updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture)
-```
-
-<Tip>
-
-This can be helpful when the model is unable to understand your request and mixes tools. An example would be:
-
-```py
-agent.run("Draw me the picture of a capybara swimming in the sea")
-```
-
-Here, the model could interpret in two ways:
- Have the `text-to-image` generate a capybara swimming in the sea
- Or, have the `text-to-image` generate capybara, then use the `image-transformation` tool to have it swim in the sea
-
-In case you would like to force the first scenario, you could do so by passing it the prompt as an argument:
-
-```py
-agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea")
-```
-
-</Tip>
-
-
-### Chat-based execution (chat)
-
-The agent also has a chat-based approach, using the [`~Agent.chat`] method:
-
-```py
-agent.chat("Generate a picture of rivers and lakes")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200> 
-
-```py
-agent.chat("Transform the picture so that there is a rock in there")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_and_beaver.png" width=200>
-
-<br/>
-
-This is an interesting approach when you want to keep the state across instructions. It's better for experimentation, 
-but will tend to be much better at single instructions rather than complex instructions (which the [`~Agent.run`]
-method is better at handling).
-
-This method can also take arguments if you would like to pass non-text types or specific prompts.
-
-### ⚠️ Remote execution
-
-For demonstration purposes and so that it could be used with all setups, we had created remote executors for several 
-of the default tools the agent has access for the release. These are created using 
-[inference endpoints](https://huggingface.co/inference-endpoints).
-
-We have turned these off for now, but in order to see how to set up remote executors tools yourself,
-we recommend reading the [custom tool guide](./custom_tools).
-
-### What's happening here? What are tools, and what are agents?
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/diagram.png">
-
-#### Agents
-
-The "agent" here is a large language model, and we're prompting it so that it has access to a specific set of tools.
-
-LLMs are pretty good at generating small samples of code, so this API takes advantage of that by prompting the 
-LLM gives a small sample of code performing a task with a set of tools. This prompt is then completed by the 
-task you give your agent and the description of the tools you give it. This way it gets access to the doc of the 
-tools you are using, especially their expected inputs and outputs, and can generate the relevant code.
-
-#### Tools
-
-Tools are very simple: they're a single function, with a name, and a description. We then use these tools' descriptions 
-to prompt the agent. Through the prompt, we show the agent how it would leverage tools to perform what was 
-requested in the query.
-
-This is using brand-new tools and not pipelines, because the agent writes better code with very atomic tools. 
-Pipelines are more refactored and often combine several tasks in one. Tools are meant to be focused on
-one very simple task only.
-
-#### Code-execution?!
-
-This code is then executed with our small Python interpreter on the set of inputs passed along with your tools. 
-We hear you screaming "Arbitrary code execution!" in the back, but let us explain why that is not the case.
-
-The only functions that can be called are the tools you provided and the print function, so you're already 
-limited in what can be executed. You should be safe if it's limited to Hugging Face tools. 
-
-Then, we don't allow any attribute lookup or imports (which shouldn't be needed anyway for passing along 
-inputs/outputs to a small set of functions) so all the most obvious attacks (and you'd need to prompt the LLM 
-to output them anyway) shouldn't be an issue. If you want to be on the super safe side, you can execute the 
-run() method with the additional argument return_code=True, in which case the agent will just return the code 
-to execute and you can decide whether to do it or not.
-
-The execution will stop at any line trying to perform an illegal operation or if there is a regular Python error 
-with the code generated by the agent.
-
-### A curated set of tools
-
-We identify a set of tools that can empower such agents. Here is an updated list of the tools we have integrated 
-in `transformers`:
-
- **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut))
- **Text question answering**: given a long text and a question, answer the question in the text ([Flan-T5](./model_doc/flan-t5))
- **Unconditional image captioning**: Caption the image! ([BLIP](./model_doc/blip))
- **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt))
- **Image segmentation**: given an image and a prompt, output the segmentation mask of that prompt ([CLIPSeg](./model_doc/clipseg))
- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper))
- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5))
- **Zero-shot text classification**: given a text and a list of labels, identify to which label the text corresponds the most ([BART](./model_doc/bart))
- **Text summarization**: summarize a long text in one or a few sentences ([BART](./model_doc/bart))
- **Translation**: translate the text into a given language ([NLLB](./model_doc/nllb))
-
-These tools have an integration in transformers, and can be used manually as well, for example:
-
-```py
-from transformers import load_tool
-
-tool = load_tool("text-to-speech")
-audio = tool("This is a text to speech tool")
-```
-
-### Custom tools
-
-While we identify a curated set of tools, we strongly believe that the main value provided by this implementation is 
-the ability to quickly create and share custom tools.
-
-By pushing the code of a tool to a Hugging Face Space or a model repository, you're then able to leverage the tool 
-directly with the agent. We've added a few 
-**transformers-agnostic** tools to the [`huggingface-tools` organization](https://huggingface.co/huggingface-tools):
-
- **Text downloader**: to download a text from a web URL
- **Text to image**: generate an image according to a prompt, leveraging stable diffusion
- **Image transformation**: modify an image given an initial image and a prompt, leveraging instruct pix2pix stable diffusion
- **Text to video**: generate a small video according to a prompt, leveraging damo-vilab
-
-The text-to-image tool we have been using since the beginning is a remote tool that lives in 
-[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! We will
-continue releasing such tools on this and other organizations, to further supercharge this implementation.
-
-The agents have by default access to tools that reside on [`huggingface-tools`](https://huggingface.co/huggingface-tools).
-We explain how to you can write and share your tools as well as leverage any custom tool that resides on the Hub in [following guide](custom_tools).
-
-### Code generation
-
-So far we have shown how to use the agents to perform actions for you. However, the agent is only generating code
-that we then execute using a very restricted Python interpreter. In case you would like to use the code generated in 
-a different setting, the agent can be prompted to return the code, along with tool definition and accurate imports.
-
-For example, the following instruction
-```python
-agent.run("Draw me a picture of rivers and lakes", return_code=True)
-```
-
-returns the following code
-
-```python
-from transformers import load_tool
-
-image_generator = load_tool("huggingface-tools/text-to-image")
-
-image = image_generator(prompt="rivers and lakes")
-```
-
-that you can then modify and execute yourself.
--- a/docs/source/es/converting_tensorflow_models.md
+++ b/docs/source/es/converting_tensorflow_models.md
@ -89,7 +89,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entre
 ```bash
 export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights

-transformers-cli convert --model_type openai-community/gpt2 \
+transformers-cli convert --model_type gpt2 \
  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT2_CONFIG] \
--- a/docs/source/es/pipeline_tutorial.md
+++ b/docs/source/es/pipeline_tutorial.md
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

 # Pipelines para inferencia

-Un [`pipeline`] simplifica el uso de cualquier modelo del [Model Hub](https://huggingface.co/models) para la inferencia en una variedad de tareas como la generación de texto, la segmentación de imágenes y la clasificación de audio. Incluso si no tienes experiencia con una modalidad específica o no comprendes el código que alimenta los modelos, ¡aún puedes usarlos con el [`pipeline`]! Este tutorial te enseñará a:
+Un [`pipeline`] simplifica el uso de cualquier modelo del [Hub](https://huggingface.co/models) para la inferencia en una variedad de tareas como la generación de texto, la segmentación de imágenes y la clasificación de audio. Incluso si no tienes experiencia con una modalidad específica o no comprendes el código que alimenta los modelos, ¡aún puedes usarlos con el [`pipeline`]! Este tutorial te enseñará a:

 * Utilizar un [`pipeline`] para inferencia.
 * Utilizar un tokenizador o modelo específico.
@ -30,114 +30,295 @@ Echa un vistazo a la documentación de [`pipeline`] para obtener una lista compl

 ## Uso del pipeline

-Si bien cada tarea tiene un [`pipeline`] asociado, es más sencillo usar la abstracción general [`pipeline`] que contiene todos los pipelines de tareas específicas. El [`pipeline`] carga automáticamente un modelo predeterminado y un tokenizador con capacidad de inferencia para tu tarea.
+Si bien cada tarea tiene un [`pipeline`] asociado, es más sencillo usar la abstracción general [`pipeline`] que contiene todos los pipelines de tareas específicas. El [`pipeline`] carga automáticamente un modelo predeterminado y un tokenizador con capacidad de inferencia para tu tarea. Veamos el ejemplo de usar un [`pipeline`] para reconocimiento automático del habla (ASR), o texto a voz.

 1. Comienza creando un [`pipeline`] y específica una tarea de inferencia:

 ```py
 >>> from transformers import pipeline

->>> generator = pipeline(task="text-generation")
+>>> transcriber = pipeline(task="automatic-speech-recognition")
 ```

-2. Pasa tu texto de entrada al [`pipeline`]:
+2. Pasa tu entrada a la [`pipeline`]. En el caso del reconocimiento del habla, esto es un archivo de entrada de audio:

 ```py
->>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone")
-[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}]
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'}
 ```

-Si tienes más de una entrada, pásala como una lista:
+¿No es el resultado que tenías en mente? Echa un vistazo a algunos de los [modelos de reconocimiento automático del habla más descargados](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending) 
+en el Hub para ver si puedes obtener una mejor transcripción.
+
+Intentemos con el modelo [Whisper large-v2](https://huggingface.co/openai/whisper-large) de OpenAI. Whisper se lanzó 
+2 años después que Wav2Vec2, y se entrenó con cerca de 10 veces más datos. Como tal, supera a Wav2Vec2 en la mayoría de las pruebas 
+downstream. También tiene el beneficio adicional de predecir puntuación y mayúsculas, ninguno de los cuales es posible con  
+Wav2Vec2.
+
+Vamos a probarlo aquí para ver cómo se desempeña:

 ```py
->>> generator(
-...     [
-...         "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone",
-...         "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne",
-...     ]
-... )
+>>> transcriber = pipeline(model="openai/whisper-large-v2")
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
 ```

-Cualquier parámetro adicional para tu tarea también se puede incluir en el [`pipeline`]. La tarea `text-generation` tiene un método [`~generation.GenerationMixin.generate`] con varios parámetros para controlar la salida. Por ejemplo, si deseas generar más de una salida, defínelo en el parámetro `num_return_sequences`:
+¡Ahora este resultado parece más preciso! Para una comparación detallada de Wav2Vec2 vs Whisper, consulta el [Curso de Transformers de Audio](https://huggingface.co/learn/audio-course/chapter5/asr_models).
+Realmente te animamos a que eches un vistazo al Hub para modelos en diferentes idiomas, modelos especializados en tu campo, y más.
+Puedes comparar directamente los resultados de los modelos desde tu navegador en el Hub para ver si se adapta o 
+maneja casos de borde mejor que otros.
+Y si no encuentras un modelo para tu caso de uso, siempre puedes empezar a [entrenar](training) el tuyo propio.
+
+Si tienes varias entradas, puedes pasar tu entrada como una lista:

 ```py
->>> generator(
-...     "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone",
-...     num_return_sequences=2,
-... )
+transcriber(
+    [
+        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac",
+        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac",
+    ]
+)
 ```

-### Selecciona un modelo y un tokenizador
+Los pipelines son ideales para la experimentación, ya que cambiar de un modelo a otro es trivial; sin embargo, hay algunas formas de optimizarlas para cargas de trabajo más grandes que la experimentación. Consulta las siguientes guías que profundizan en iterar sobre conjuntos de datos completos o utilizar pipelines en un servidor web:
+de la documentación:

-El [`pipeline`] acepta cualquier modelo del [Model Hub](https://huggingface.co/models). Hay etiquetas en el Model Hub que te permiten filtrar por el modelo que te gustaría utilizar para tu tarea. Una vez que hayas elegido un modelo apropiado, cárgalo con la clase `AutoModelFor` y [`AutoTokenizer`] correspondientes. Por ejemplo, carga la clase [`AutoModelForCausalLM`] para una tarea de modelado de lenguaje causal:
+* [Uso de pipelines en un conjunto de datos](#uso-de-pipelines-en-un-conjunto-de-datos)
+* [Uso de pipelines para un servidor web](./pipeline_webserver)
+
+## Parámetros
+
+[`pipeline`] admite muchos parámetros; algunos son específicos de la tarea y algunos son generales para todas las pipelines. En general, puedes especificar parámetros en cualquier lugar que desees:

 ```py
->>> from transformers import AutoTokenizer, AutoModelForCausalLM
+transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1)

->>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
->>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+out = transcriber(...)  # This will use `my_parameter=1`.
+out = transcriber(..., my_parameter=2)  # This will override and use `my_parameter=2`.
+out = transcriber(...)  # This will go back to using `my_parameter=1`.
 ```

-Crea un [`pipeline`] para tu tarea y específica el modelo y el tokenizador que cargaste:
+Vamos a echar un vistazo a tres importantes:
+
+### Device
+
+Si usas `device=n`, el pipeline automáticamente coloca el modelo en el dispositivo especificado. Esto funcionará independientemente de si estás utilizando PyTorch o Tensorflow.

 ```py
->>> from transformers import pipeline
-
->>> generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
+transcriber = pipeline(model="openai/whisper-large-v2", device=0)
 ```

-Pasa tu texto de entrada a [`pipeline`] para generar algo de texto:
+Si el modelo es demasiado grande para una sola GPU y estás utilizando PyTorch, puedes establecer `device_map="auto"` para determinar automáticamente cómo cargar y almacenar los pesos del modelo. Utilizar el argumento `device_map` requiere el paquete 🤗 [Accelerate](https://huggingface.co/docs/accelerate):
+
+```bash
+pip install --upgrade accelerate
+```
+
+El siguiente código carga y almacena automáticamente los pesos del modelo en varios dispositivos:

 ```py
->>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone")
-[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}]
+transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")
 ```

-## Pipeline de audio
+Tenga en cuenta que si se pasa `device_map="auto"`, no es necesario agregar el argumento `device=device` al instanciar tu `pipeline`, ¡ya que podrías encontrar algún comportamiento inesperado!

-La flexibilidad de [`pipeline`] significa que también se puede extender a tareas de audio.
+### Batch size

-Por ejemplo, clasifiquemos la emoción de un breve fragmento del famoso discurso de John F. Kennedy ["We choose to go to the Moon"](https://en.wikipedia.org/wiki/We_choose_to_go_to_the_Moon). Encuentra un modelo de [audio classification](https://huggingface.co/models?pipeline_tag=audio-classification) para reconocimiento de emociones en el Model Hub y cárgalo en el [`pipeline`]:
+Por defecto, los pipelines no realizarán inferencia por lotes por razones explicadas en detalle [aquí](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). La razón es que la agrupación en lotes no es necesariamente más rápida y, de hecho, puede ser bastante más lenta en algunos casos.
+
+Pero si funciona en tu caso de uso, puedes utilizar:

 ```py
->>> from transformers import pipeline
-
->>> audio_classifier = pipeline(
-...     task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
-... )
+transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2)
+audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)]
+texts = transcriber(audio_filenames)
 ```

-Pasa el archivo de audio al [`pipeline`]:
+Esto ejecuta el pipeline en los 4 archivos de audio proporcionados, pero los pasará en lotes de a 2 al modelo (que está en una GPU, donde la agrupación en lotes es más probable que ayude) sin requerir ningún código adicional de tu parte. La salida siempre debería coincidir con lo que habrías recibido sin agrupación en lotes. Solo se pretende como una forma de ayudarte a obtener más velocidad de una pipeline.
+
+Los pipelines también pueden aliviar algunas de las complejidades de la agrupación en lotes porque, para algunos pipelines, un solo elemento (como un archivo de audio largo) necesita ser dividido en varias partes para ser procesado por un modelo. El pipeline realiza esta [*agrupación en lotes de fragmentos*](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-chunk-batching) por ti.
+
+### Task specific parameters
+
+Todas las tareas proporcionan parámetros específicos de la tarea que permiten flexibilidad adicional y opciones para ayudarte a completar tu trabajo. Por ejemplo, el método [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] tiene un parámetro `return_timestamps` que suena prometedor para subtítulos de videos:

 ```py
->>> audio_classifier("jfk_moon_speech.wav")
-[{'label': 'calm', 'score': 0.13856211304664612},
- {'label': 'disgust', 'score': 0.13148026168346405},
- {'label': 'happy', 'score': 0.12635163962841034},
- {'label': 'angry', 'score': 0.12439591437578201},
- {'label': 'fearful', 'score': 0.12404385954141617}]
+>>> transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True)
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.', 'chunks': [{'timestamp': (0.0, 11.88), 'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its'}, {'timestamp': (11.88, 12.38), 'text': ' creed.'}]}
 ```

+Como puedes ver, el modelo infirió el texto y también salió **cuándo** se pronunciaron las distintas oraciones.
+
+Hay muchos parámetros disponibles para cada tarea, así que echa un vistazo a la referencia de la API de cada tarea para ver qué puedes ajustar. Por ejemplo, el [`~transformers.AutomaticSpeechRecognitionPipeline`] tiene un parámetro `chunk_length_s` que es útil para trabajar con archivos de audio realmente largos (por ejemplo, subtítulos de películas completas o videos de una hora de duración) que un modelo típicamente no puede manejar solo:
+
+```python
+>>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30)
+>>> transcriber("https://huggingface.co/datasets/reach-vb/random-audios/resolve/main/ted_60.wav")
+{'text': " So in college, I was a government major, which means I had to write a lot of papers. Now, when a normal student writes a paper, they might spread the work out a little like this. So, you know. You get started maybe a little slowly, but you get enough done in the first week that with some heavier days later on, everything gets done and things stay civil. And I would want to do that like that. That would be the plan. I would have it all ready to go, but then actually the paper would come along, and then I would kind of do this. And that would happen every single paper. But then came my 90-page senior thesis, a paper you're supposed to spend a year on. I knew for a paper like that, my normal workflow was not an option, it was way too big a project. So I planned things out and I decided I kind of had to go something like this. This is how the year would go. So I'd start off light and I'd bump it up"}
+```
+
+¡Si no puedes encontrar un parámetro que te ayude, no dudes en [solicitarlo](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)!
+
+## Uso de pipelines en un conjunto de datos
+
+Los pipeline también puede ejecutar inferencia en un conjunto de datos grande. La forma más fácil que recomendamos para hacer esto es utilizando un iterador:
+
+```py
+def data():
+    for i in range(1000):
+        yield f"My example {i}"
+
+
+pipe = pipeline(model="openai-community/gpt2", device=0)
+generated_characters = 0
+for out in pipe(data()):
+    generated_characters += len(out[0]["generated_text"])
+```
+
+El iterador `data()` produce cada resultado, y el pipeline automáticamente
+reconoce que la entrada es iterable y comenzará a buscar los datos mientras
+continúa procesándolos en la GPU (dicho proceso utiliza [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader)). Esto es importante porque no tienes que asignar memoria para todo el conjunto de datos y puedes alimentar la GPU lo más rápido posible.
+
+Dado que la agrupación en lotes podría acelerar las cosas, puede ser útil intentar ajustar el parámetro `batch_size` aquí.
+
+La forma más sencilla de iterar sobre un conjunto de datos es cargandolo desde 🤗 [Datasets](https://github.com/huggingface/datasets/):
+
+```py
+# KeyDataset is a util that will just output the item we're interested in.
+from transformers.pipelines.pt_utils import KeyDataset
+from datasets import load_dataset
+
+pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0)
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]")
+
+for out in pipe(KeyDataset(dataset, "audio")):
+    print(out)
+```
+
+## Uso de pipelines para un servidor web
+
+<Tip>
+Crear un motor de inferencia es un tema complejo que merece su propia página.
+</Tip>
+
+[Link](./pipeline_webserver)
+
 ## Pipeline de visión

-Finalmente, utilizar un [`pipeline`] para tareas de visión es prácticamente igual.
+Usar un [`pipeline`] para tareas de visión es prácticamente idéntico.

-Específica tu tarea de visión y pasa tu imagen al clasificador. La imagen puede ser un enlace o una ruta local a la imagen. Por ejemplo, ¿qué especie de gato se muestra a continuación?
+Especifica tu tarea y pasa tu imagen al clasificador. La imagen puede ser un enlace, una ruta local o una imagen codificada en base64. Por ejemplo, ¿qué especie de gato se muestra a continuación?

 ![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg)

 ```py
 >>> from transformers import pipeline

->>> vision_classifier = pipeline(task="image-classification")
->>> vision_classifier(
+>>> vision_classifier = pipeline(model="google/vit-base-patch16-224")
+>>> preds = vision_classifier(
 ...     images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
 ... )
-[{'label': 'lynx, catamount', 'score': 0.4403027892112732},
- {'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor',
-  'score': 0.03433405980467796},
- {'label': 'snow leopard, ounce, Panthera uncia',
-  'score': 0.032148055732250214},
- {'label': 'Egyptian cat', 'score': 0.02353910356760025},
- {'label': 'tiger cat', 'score': 0.023034192621707916}]
+>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
+>>> preds
+[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}]
 ```
+
+## Pipeline de texto
+
+Usar un [`pipeline`] para tareas de PLN es prácticamente idéntico.
+
+```py
+>>> from transformers import pipeline
+
+>>> # This model is a `zero-shot-classification` model.
+>>> # It will classify text, except you are free to choose any label you might imagine
+>>> classifier = pipeline(model="facebook/bart-large-mnli")
+>>> classifier(
+...     "I have a problem with my iphone that needs to be resolved asap!!",
+...     candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
+... )
+{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}
+```
+
+## Pipeline multimodal
+
+[`pipeline`] admite más de una modalidad. Por ejemplo, una tarea de respuesta a preguntas visuales (VQA) combina texto e imagen. No dudes en usar cualquier enlace de imagen que desees y una pregunta que quieras hacer sobre la imagen. La imagen puede ser una URL o una ruta local a la imagen.
+
+Por ejemplo, si usas esta [imagen de factura](https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png):
+
+```py
+>>> from transformers import pipeline
+
+>>> vqa = pipeline(model="impira/layoutlm-document-qa")
+>>> output = vqa(
+...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
+...     question="What is the invoice number?",
+... )
+>>> output[0]["score"] = round(output[0]["score"], 3)
+>>> output
+[{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}]
+```
+
+<Tip>
+
+Para ejecutar el ejemplo anterior, debe tener instalado [`pytesseract`](https://pypi.org/project/pytesseract/) además de 🤗 Transformers:
+
+```bash
+sudo apt install -y tesseract-ocr
+pip install pytesseract
+```
+
+</Tip>
+
+## Uso de `pipeline` en modelos grandes con 🤗 `accelerate`:
+
+¡Puedes ejecutar fácilmente `pipeline` en modelos grandes utilizando 🤗 `accelerate`! Primero asegúrate de haber instalado `accelerate` con `pip install accelerate`. 
+
+¡Luego carga tu modelo utilizando `device_map="auto"`! Utilizaremos `facebook/opt-1.3b` para nuestro ejemplo.
+
+```py
+# pip install accelerate
+import torch
+from transformers import pipeline
+
+pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
+
+También puedes pasar modelos cargados de 8 bits sí instalas `bitsandbytes` y agregas el argumento `load_in_8bit=True`
+
+```py
+# pip install accelerate bitsandbytes
+import torch
+from transformers import pipeline
+
+pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True})
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
+
+Nota que puedes reemplazar el punto de control con cualquier modelo de Hugging Face que admita la carga de modelos grandes, como BLOOM.
+
+## Crear demos web desde pipelines con `gradio`
+
+Los pipelines están automáticamente soportadas en [Gradio](https://github.com/gradio-app/gradio/), una biblioteca que hace que crear aplicaciones de aprendizaje automático hermosas y fáciles de usar en la web sea un proceso sencillo. Primero, asegúrate de tener Gradio instalado:
+
+```
+pip install gradio
+```
+
+Luego, puedes crear una demo web alrededor de una pipeline de clasificación de imágenes (o cualquier otra pipeline) en una sola línea de código llamando a la función `Interface.from_pipeline` de Gradio para lanzar la pipeline. Esto crea una interfaz intuitiva *drag-and-drop* en tu navegador:
+
+```py
+from transformers import pipeline
+import gradio as gr
+
+pipe = pipeline("image-classification", model="google/vit-base-patch16-224")
+
+gr.Interface.from_pipeline(pipe).launch()
+```
+
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/panda-classification.png)
+
+De forma predeterminada, la demo web se ejecuta en un servidor local. Si deseas compartirlo con otros, puedes generar un enlace público temporal estableciendo `share=True` en `launch()`. También puedes hospedar tu demo en [Hugging Face Spaces](https://huggingface.co/spaces) para un enlace permanente.
--- a/docs/source/es/pipeline_webserver.md
+++ b/docs/source/es/pipeline_webserver.md
@ -9,13 +9,7 @@ Crear un motor de inferencia es un tema complejo, y la "mejor" solución probabl
 </Tip>


-Lo fundamental para entender es que podemos usar un iterador, tal como [en un conjunto de datos](https://huggingface.co/docs/transformers/pipeline_tutorial#using-pipelines-on-a-dataset), ya que un servidor web es básicamente un sistema que espera solicitudes y las trata a medida que llegan.
-
-<!--
-To do:
-* Check the content of es/pipeline_tutorial.md
-* And update the link [en un conjunto de datos] -> (pipeline_tutorial#pipelines-en-un-conjunto-de-datos)
-->
+Lo fundamental para entender es que podemos usar un iterador, tal como [en un conjunto de datos](pipeline_tutorial#uso-de-pipelines-en-un-conjunto-de-datos), ya que un servidor web es básicamente un sistema que espera solicitudes y las trata a medida que llegan.

 Por lo general, los servidores web están multiplexados (multihilo, asíncrono, etc.) para manejar varias solicitudes simultáneamente. Por otro lado, los flujos de trabajo (y principalmente los modelos subyacentes) no son realmente ideales para el paralelismo; consumen mucha RAM, por lo que es mejor darles todos los recursos disponibles cuando se están ejecutando o es un trabajo intensivo en cómputo.

--- a/docs/source/it/converting_tensorflow_models.md
+++ b/docs/source/it/converting_tensorflow_models.md
@ -13,12 +13,12 @@ rendered properly in your Markdown viewer.

 # Convertire checkpoint di Tensorflow

-È disponibile un'interfaccia a linea di comando per convertire gli originali checkpoint di Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM 
+È disponibile un'interfaccia a linea di comando per convertire gli originali checkpoint di Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM
 in modelli che possono essere caricati utilizzando i metodi `from_pretrained` della libreria.

 <Tip>

-A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers-cli**), disponibile in ogni installazione 
+A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers-cli**), disponibile in ogni installazione
 di transformers >=2.3.0.

 La seguente documentazione riflette il formato dei comandi di **transformers-cli convert**.
@ -27,19 +27,19 @@ La seguente documentazione riflette il formato dei comandi di **transformers-cli

 ## BERT

-Puoi convertire qualunque checkpoint Tensorflow di BERT (in particolare 
-[i modeli pre-allenati rilasciati da Google](https://github.com/google-research/bert#pre-trained-models)) 
-in un file di salvataggio Pytorch utilizzando lo script 
+Puoi convertire qualunque checkpoint Tensorflow di BERT (in particolare
+[i modeli pre-allenati rilasciati da Google](https://github.com/google-research/bert#pre-trained-models))
+in un file di salvataggio Pytorch utilizzando lo script
 [convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py).

-Questo CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `bert_model.ckpt`) ed il relativo 
+Questo CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `bert_model.ckpt`) ed il relativo
 file di configurazione (`bert_config.json`), crea un modello Pytorch per questa configurazione, carica i pesi dal
-checkpoint di Tensorflow nel modello di Pytorch e salva il modello che ne risulta in un file di salvataggio standard di Pytorch che 
+checkpoint di Tensorflow nel modello di Pytorch e salva il modello che ne risulta in un file di salvataggio standard di Pytorch che
 può essere importato utilizzando `from_pretrained()` (vedi l'esempio nel
 [quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ).

-Devi soltanto lanciare questo script di conversione **una volta** per ottenere un modello Pytorch. Dopodichè, potrai tralasciare 
-il checkpoint di Tensorflow (i tre files che iniziano con `bert_model.ckpt`), ma assicurati di tenere il file di configurazione 
+Devi soltanto lanciare questo script di conversione **una volta** per ottenere un modello Pytorch. Dopodichè, potrai tralasciare
+il checkpoint di Tensorflow (i tre files che iniziano con `bert_model.ckpt`), ma assicurati di tenere il file di configurazione
 (`bert_config.json`) ed il file di vocabolario (`vocab.txt`) in quanto queste componenti sono necessarie anche per il modello di Pytorch.

 Per lanciare questo specifico script di conversione avrai bisogno di un'installazione di Tensorflow e di Pytorch
@ -59,11 +59,11 @@ Puoi scaricare i modelli pre-allenati di Google per la conversione [qua](https:/

 ## ALBERT

-Per il modello ALBERT, converti checkpoint di Tensoflow in Pytorch utilizzando lo script 
+Per il modello ALBERT, converti checkpoint di Tensoflow in Pytorch utilizzando lo script
 [convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py).

-Il CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `model.ckpt-best`) e i relativi file di 
-configurazione (`albert_config.json`), dopodichè crea e salva un modello Pytorch. Per lanciare questa conversione 
+Il CLI prende come input un checkpoint di Tensorflow (tre files che iniziano con `model.ckpt-best`) e i relativi file di
+configurazione (`albert_config.json`), dopodichè crea e salva un modello Pytorch. Per lanciare questa conversione
 avrai bisogno di un'installazione di Tensorflow e di Pytorch.

 Ecco un esempio del procedimento di conversione di un modello `ALBERT Base` pre-allenato:
@ -97,7 +97,7 @@ Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allen

 ```bash
 export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
-transformers-cli convert --model_type openai-community/gpt2 \
+transformers-cli convert --model_type gpt2 \
  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT2_CONFIG] \
--- a/docs/source/ja/custom_tools.md
+++ b/docs/source/ja/custom_tools.md
@ -16,749 +16,11 @@ rendered properly in your Markdown viewer.

 # Custom Tools and Prompts

-<Tip>
-
-トランスフォーマーのコンテキストでツールとエージェントが何であるかを知らない場合、
-まず[Transformers Agents](transformers_agents)ページをお読みいただくことをお勧めします。
-
-</Tip>
-
 <Tip warning={true}>

-Transformers Agentsは実験的なAPIであり、いつでも変更される可能性があります。
-エージェントによって返される結果は、APIや基礎となるモデルが変更される可能性があるため、変化することがあります。
+The Agents framework has significantly changed in version v4.41.0.
+This document has been removed as it was referencing an older API.
+
+We eagerly welcome new contributions for the updated API.

 </Tip>
-
-カスタムツールとプロンプトを作成し、使用することは、エージェントを強化し、新しいタスクを実行させるために非常に重要です。
-このガイドでは、以下の内容を説明します：
-
- プロンプトのカスタマイズ方法
- カスタムツールの使用方法
- カスタムツールの作成方法
-
-## Customizing the prompt
-
-[Transformers Agents](transformers_agents)で説明されているように、エージェントは[`~Agent.run`]および[`~Agent.chat`]モードで実行できます。
-`run`モードと`chat`モードの両方は同じロジックに基づいています。
-エージェントを駆動する言語モデルは、長いプロンプトに基づいて条件付けられ、
-次のトークンを生成して停止トークンに達するまでプロンプトを完了します。
-両者の唯一の違いは、`chat`モードの間にプロンプトが前のユーザーの入力とモデルの生成と共に拡張されることです。
-これにより、エージェントは過去の対話にアクセスでき、エージェントにあたかもメモリがあるかのように見えます。
-
-### Structure of the prompt
-
-プロンプトがどのように構築され、どのように最適化できるかを理解するために、プロンプトは大まかに4つの部分に分かれています。
-
-1. イントロダクション：エージェントの振る舞い、ツールの概念の説明。
-2. すべてのツールの説明。これはユーザーによって定義/選択されたツールでランタイム時に動的に置換される`<<all_tools>>`トークンによって定義されます。
-3. タスクとその解決策の一連の例。
-4. 現在の例と解決策の要求。
-
-各部分をよりよく理解するために、`run`プロンプトがどのように見えるかの簡略版を見てみましょう：
-
-````text
-タスクを実行するために、Pythonのシンプルなコマンドのシリーズを考えてくることがあるでしょう。
-[...]
-意味がある場合は、中間結果を表示することができます。
-
-ツール：
- document_qa：これはドキュメント（pdf）に関する質問に答えるツールです。情報を含むドキュメントである `document` と、ドキュメントに関する質問である `question` を受け取り、質問に対する回答を含むテキストを返します。
- image_captioner：これは画像の説明を生成するツールです。キャプションにする画像である `image` と、説明を含む英語のテキストを返すテキストを受け取ります。
-[...]
-
-タスク: "変数 `question` に関する質問に答えるための画像について回答してください。質問はフランス語です。"
-
-次のツールを使用します：質問を英語に翻訳するための `translator`、そして入力画像に関する質問に答えるための `image_qa`。
-
-回答：
-```py
-translated_question = translator(question=question, src_lang="French", tgt_lang="English")
-print(f"The translated question is {translated_question}.")
-answer = image_qa(image=image, question=translated_question)
-print(f"The answer is {answer}")
-```
-
-タスク：「`document`内で最年長の人物を特定し、その結果をバナーとして表示する。」
-
-以下のツールを使用します：`document_qa`を使用してドキュメント内で最年長の人物を見つけ、その回答に従って`image_generator`を使用して画像を生成します。
-
-回答：
-```py
-answer = document_qa(document, question="What is the oldest person?")
-print(f"The answer is {answer}.")
-image = image_generator("A banner showing " + answer)
-```
-
-[...]
-タスク: "川と湖の絵を描いてください"
-
-以下のものを使用します
-````
-
-導入部分（"Tools:"の前のテキスト）は、モデルの振る舞いと実行すべきタスクを正確に説明しています。
-この部分はおそらくエージェントが常に同じ方法で振る舞う必要があるため、カスタマイズする必要はありません。
-
-2番目の部分（"Tools"の下の箇条書き）は、`run`または`chat`を呼び出すたびに動的に追加されます。
-`agent.toolbox`内のツールの数と同じ数の箇条書きがあり、それぞれの箇条書きにはツールの名前と説明が含まれています。
-
-```text
- <tool.name>: <tool.description>
-```
-
-もうすぐ確認しましょう。 `document_qa` ツールを読み込んで名前と説明を出力します。
-
-```py
-from transformers import load_tool
-
-document_qa = load_tool("document-question-answering")
-print(f"- {document_qa.name}: {document_qa.description}")
-```
-
-which gives:
-```text
- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question.
-```
-
-ツール説明:
-このツールは、2つのパートから成り立っています。最初のパートでは、ツールが何を行うかを説明し、2番目のパートでは入力引数と戻り値がどのように期待されるかを述べています。
-
-良いツール名とツールの説明は、エージェントが正しく使用するために非常に重要です。エージェントがツールについて持っている唯一の情報は、その名前と説明です。したがって、ツール名と説明の両方が正確に記述され、ツールボックス内の既存のツールのスタイルに合致することを確認する必要があります。特に、説明にはコードスタイルで名前で期待されるすべての引数が言及され、期待される型とそれらが何であるかの説明も含めるべきです。
-
-<Tip>
-
-キュレートされたTransformersツールの命名と説明を確認して、ツールがどのような名前と説明を持つべきかを理解するのに役立ちます。
-すべてのツールは[`Agent.toolbox`]プロパティで確認できます。
-
-</Tip>
-
-
-カスタマイズされた例：
-ツールの使い方をエージェントに正確に示す一連の例が含まれています。これらの例は、エージェントが実際に正確で実行可能なコードを生成する可能性を最大化するように書かれているため、非常に重要です。大規模な言語モデルは、プロンプト内のパターンを認識し、新しいデータを使用してそのパターンを繰り返すことに非常に優れています。したがって、実践で正しい実行可能なコードを生成するエージェントの可能性を最大化するように、これらの例は書かれている必要があります。
-
-以下は、一つの例です：
-
-````text
-Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner."
-
-I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
-
-Answer:
-```py
-answer = document_qa(document, question="What is the oldest person?")
-print(f"The answer is {answer}.")
-image = image_generator("A banner showing " + answer)
-```
-
-````
-
-パターン：モデルが繰り返しを行うように指示されるパターンには、3つの部分があります。
-タスクの声明、エージェントの意図した動作の説明、そして最後に生成されるコードです。
-プロンプトの一部であるすべての例には、この正確なパターンがあり、エージェントが新しいトークンを生成する際にも
-同じパターンを再現することを確認しています。
-
-プロンプトの例はTransformersチームによって厳選され、一連の問題ステートメントで厳密に評価されます。
-これにより、エージェントのプロンプトがエージェントの実際の使用ケースを解決するためにできるだけ優れたものになります。
-
-プロンプトの最後の部分に対応しています：
-
-[こちら](https://github.com/huggingface/transformers/blob/main/src/transformers/tools/evaluate_agent.py)の問題ステートメントで厳密に評価される、エージェントのプロンプトができるだけ優れたものになるように
-慎重に選定されたプロンプト例を提供しています。
-
-```text
-Task: "Draw me a picture of rivers and lakes"
-
-I will use the following
-```
-
-
-これがエージェントに完成させるための最終的で未完成の例です。未完成の例は、実際のユーザー入力に基づいて動的に作成されます。上記の例では、ユーザーが次のように実行しました：
-
-```py
-agent.run("Draw me a picture of rivers and lakes")
-```
-
-ユーザーの入力 - つまり、タスク："川と湖の絵を描いてください"は、以下のようなプロンプトテンプレートに変換されます："タスク：<task> \n\n 次に私は以下を使用します"。
-この文は、エージェントが条件付けられたプロンプトの最終行を構成し、したがってエージェントに対して前の例とまったく同じ方法で例を終了するよう強く影響します。
-
-詳細には立ち入りませんが、チャットテンプレートは同じプロンプト構造を持ち、例はわずかに異なるスタイルを持っています。例：
-
-````text
-[...]
-
-=====
-
-Human: Answer the question in the variable `question` about the image stored in the variable `image`.
-
-Assistant: I will use the tool `image_qa` to answer the question on the input image.
-
-```py
-answer = image_qa(text=question, image=image)
-print(f"The answer is {answer}")
-```
-
-Human: I tried this code, it worked but didn't give me a good result. The question is in French
-
-Assistant: In this case, the question needs to be translated first. I will use the tool `translator` to do this.
-
-```py
-translated_question = translator(question=question, src_lang="French", tgt_lang="English")
-print(f"The translated question is {translated_question}.")
-answer = image_qa(text=translated_question, image=image)
-print(f"The answer is {answer}")
-```
-
-=====
-
-[...]
-````
-
-*Human:* `run`プロンプトの例とは対照的に、各`chat`プロンプトの例には*Human*と*Assistant*の間で1つ以上のやりとりがあります。各やりとりは、`run`プロンプトの例と同様の構造になっています。ユーザーの入力は*Human:*の後ろに追加され、エージェントにはコードを生成する前に何を行う必要があるかを最初に生成するように指示されます。やりとりは以前のやりとりに基づいて行われることがあり、ユーザーが「I tried **this** code」と入力したように、以前に生成されたエージェントのコードを参照できます。
-
-*Assistant:* `.chat`を実行すると、ユーザーの入力または*タスク*が未完了の形式に変換されます：
-
-```text
-Human: <user-input>\n\nAssistant:
-```
-
-以下のエージェントが完了するコマンドについて説明します。 `run` コマンドとは対照的に、`chat` コマンドは完了した例をプロンプトに追加します。そのため、次の `chat` ターンのためにエージェントにより多くの文脈を提供します。
-
-さて、プロンプトの構造がわかったところで、どのようにカスタマイズできるかを見てみましょう！
-
-### Writing good user inputs
-
-大規模な言語モデルはユーザーの意図を理解する能力がますます向上していますが、エージェントが正しいタスクを選択するのを助けるために、できるだけ正確に記述することが非常に役立ちます。できるだけ正確であるとは何を意味するのでしょうか？
-
-エージェントは、プロンプトでツール名とその説明のリストを見ています。ツールが追加されるほど、エージェントが正しいツールを選択するのが難しくなり、正しいツールの連続を選択するのはさらに難しくなります。共通の失敗例を見てみましょう。ここではコードのみを返すことにします。
-
-
-```py
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-
-agent.run("Show me a tree", return_code=True)
-```
-
-gives:
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_segmenter` to create a segmentation mask for the image.
-
-
-==Code generated by the agent==
-mask = image_segmenter(image, prompt="tree")
-```
-
-これはおそらく私たちが望んでいたものではないでしょう。代わりに、木の画像が生成されることがより可能性が高いです。
-特定のツールを使用するようエージェントを誘導するために、ツールの名前や説明に含まれている重要なキーワードを使用することは非常に役立ちます。さて、詳しく見てみましょう。
-
-```py
-agent.toolbox["image_generator"].description
-```
-
-```text
-'This is a tool that creates an image according to a prompt, which is a text description. It takes an input named `prompt` which contains the image description and outputs an image.
-```
-
-名前と説明文には、キーワード「画像」、「プロンプト」、「作成」、および「生成」が使用されています。これらの言葉を使用することで、ここでの動作がより効果的になる可能性が高いです。プロンプトを少し詳細に調整しましょう。
-
-```py
-agent.run("Create an image of a tree", return_code=True)
-```
-
-gives:
-```text
-==Explanation from the agent==
-I will use the following tool `image_generator` to generate an image of a tree.
-
-
-==Code generated by the agent==
-image = image_generator(prompt="tree")
-```
-
-簡単に言うと、エージェントがタスクを正確に適切なツールにマッピングできない場合は、ツールの名前や説明の最も関連性のあるキーワードを調べて、タスクリクエストをそれに合わせて洗練させてみてください。
-
-
-### Customizing the tool descriptions
-
-以前にも見たように、エージェントは各ツールの名前と説明にアクセスできます。ベースのツールは非常に正確な名前と説明を持っているはずですが、特定のユースケースに合わせてツールの説明や名前を変更することが役立つかもしれません。これは、非常に類似した複数のツールを追加した場合や、特定のドメイン（たとえば、画像生成や変換など）でエージェントを使用する場合に特に重要になるかもしれません。
-
-よくある問題は、エージェントが画像生成タスクに頻繁に使用される場合、画像生成と画像変換/修正を混同することです。
-
-例：
-
-```py
-agent.run("Make an image of a house and a car", return_code=True)
-```
-
-returns
-```text
-==Explanation from the agent== 
-I will use the following tools `image_generator` to generate an image of a house and `image_transformer` to transform the image of a car into the image of a house.
-
-==Code generated by the agent==
-house_image = image_generator(prompt="A house")
-car_image = image_generator(prompt="A car")
-house_car_image = image_transformer(image=car_image, prompt="A house")
-```
-
-これはおそらく私たちがここで望んでいる正確なものではないようです。エージェントは「image_generator」と「image_transformer」の違いを理解するのが難しいようで、しばしば両方を一緒に使用します。
-
-ここでエージェントをサポートするために、"image_transformer"のツール名と説明を変更して、少し"image"や"prompt"から切り離してみましょう。代わりにそれを「modifier」と呼びましょう：
-
-```py
-agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer")
-agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace(
-    "transforms an image according to a prompt", "modifies an image"
-)
-```
-
-「変更」は、上記のプロンプトに新しい画像プロセッサを使用する強力な手がかりです。それでは、もう一度実行してみましょう。
-
-
-```py
-agent.run("Make an image of a house and a car", return_code=True)
-```
-
-Now we're getting:
-```text
-==Explanation from the agent==
-I will use the following tools: `image_generator` to generate an image of a house, then `image_generator` to generate an image of a car.
-
-
-==Code generated by the agent==
-house_image = image_generator(prompt="A house")
-car_image = image_generator(prompt="A car")
-```
-
-これは、私たちが考えていたものに確実に近づいています！ただし、家と車を同じ画像に含めたいと考えています。タスクを単一の画像生成に向けることで、より適切な方向に進めるはずです：
-
-```py
-agent.run("Create image: 'A house and car'", return_code=True)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_generator` to generate an image.
-
-
-==Code generated by the agent==
-image = image_generator(prompt="A house and car")
-```
-
-<Tip warning={true}>
-
-エージェントは、特に複数のオブジェクトの画像を生成するなど、やや複雑なユースケースに関しては、まだ多くのユースケースに対して脆弱です。
-エージェント自体とその基礎となるプロンプトは、今後数ヶ月でさらに改善され、さまざまなユーザーの入力に対してエージェントがより頑健になるようになります。
-
-</Tip>
-
-### Customizing the whole project
-
-ユーザーに最大限の柔軟性を提供するために、[上記](#structure-of-the-prompt)で説明されたプロンプトテンプレート全体をユーザーが上書きできます。この場合、カスタムプロンプトには導入セクション、ツールセクション、例セクション、未完了の例セクションが含まれていることを確認してください。`run` プロンプトテンプレートを上書きしたい場合、以下のように行うことができます:
-
-
-```py
-template = """ [...] """
-
-agent = HfAgent(your_endpoint, run_prompt_template=template)
-```
-
-<Tip warning={true}>
-
-`<<all_tools>>` 文字列と `<<prompt>>` は、エージェントが使用できるツールを認識し、ユーザーのプロンプトを正しく挿入できるように、`template` のどこかに定義されていることを確認してください。
-
-</Tip>
-
-同様に、`chat` プロンプトテンプレートを上書きすることもできます。なお、`chat` モードでは常に以下の形式で交換が行われます：
-
-上記のテキストの上に日本語の翻訳を提供してください。Markdownコードとして書いてください。
-
-
-```text
-Human: <<task>>
-
-Assistant:
-```
-
-したがって、カスタム`chat`プロンプトテンプレートの例もこのフォーマットを使用することが重要です。以下のように、インスタンス化時に`chat`テンプレートを上書きできます。
-
-```python
-template = """ [...] """
-
-agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template)
-```
-
-<Tip warning={true}>
-
-`<<all_tools>>` という文字列が `template` 内で定義されていることを確認してください。これにより、エージェントは使用可能なツールを把握できます。
-
-</Tip>
-
-両方の場合、プロンプトテンプレートの代わりに、コミュニティの誰かがホストしたテンプレートを使用したい場合は、リポジトリIDを渡すことができます。デフォルトのプロンプトは、[このリポジトリ](https://huggingface.co/datasets/huggingface-tools/default-prompts) にありますので、参考になります。
-
-カスタムプロンプトをHubのリポジトリにアップロードしてコミュニティと共有する場合は、次のことを確認してください：
- データセットリポジトリを使用すること
- `run` コマンド用のプロンプトテンプレートを `run_prompt_template.txt` という名前のファイルに配置すること
- `chat` コマンド用のプロンプトテンプレートを `chat_prompt_template.txt` という名前のファイルに配置すること
-
-## Using custom tools
-
-このセクションでは、画像生成に特化した2つの既存のカスタムツールを利用します：
-
- [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation) をより多くの画像変更を可能にするために [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool) に置き換えます。
- 画像のアップスケーリング用の新しいツールをデフォルトのツールボックスに追加します：[diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool) は既存の画像変換ツールを置き換えます。
-
-便利な [`load_tool`] 関数を使用してカスタムツールをロードします：
-
-```py
-from transformers import load_tool
-
-controlnet_transformer = load_tool("diffusers/controlnet-canny-tool")
-upscaler = load_tool("diffusers/latent-upscaler-tool")
-```
-
-エージェントにカスタムツールを追加すると、ツールの説明と名前がエージェントのプロンプトに自動的に含まれます。したがって、エージェントがカスタムツールの使用方法を理解できるように、カスタムツールには適切に記述された説明と名前が必要です。
-
-`controlnet_transformer`の説明と名前を見てみましょう。
-
-最初に、便利な[`load_tool`]関数を使用してカスタムツールをロードします。
-
-```py
-print(f"Description: '{controlnet_transformer.description}'")
-print(f"Name: '{controlnet_transformer.name}'")
-```
-
-gives 
-```text
-Description: 'This is a tool that transforms an image with ControlNet according to a prompt. 
-It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.'
-Name: 'image_transformer'
-```
-
-名前と説明は正確であり、[厳選されたツール](./transformers_agents#a-curated-set-of-tools)のスタイルに合っています。
-
-次に、`controlnet_transformer`と`upscaler`を使ってエージェントをインスタンス化します。
-
-```py
-tools = [controlnet_transformer, upscaler]
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools)
-
-```
-
-以下のコマンドは、以下の情報を提供します：
-
-
-```text
-image_transformer has been replaced by <transformers_modules.diffusers.controlnet-canny-tool.bd76182c7777eba9612fc03c0
-8718a60c0aa6312.image_transformation.ControlNetTransformationTool object at 0x7f1d3bfa3a00> as provided in `additional_tools`
-```
-
-一連の厳選されたツールにはすでに `image_transformer` ツールがあり、これをカスタムツールで置き換えます。
-
-<Tip>
-
-既存のツールを上書きすることは、特定のタスクに既存のツールをまったく同じ目的で使用したい場合に有益であることがあります。
-なぜなら、エージェントはその特定のタスクの使用方法に精通しているからです。この場合、カスタムツールは既存のツールとまったく同じAPIに従うか、そのツールを使用するすべての例が更新されるようにプロンプトテンプレートを適応させる必要があります。
-
-</Tip>
-
-アップスケーラーツールには `image_upscaler` という名前が付けられ、これはデフォルトのツールボックスにはまだ存在しないため、単にツールのリストに追加されます。
-エージェントが現在使用可能なツールボックスを確認するには、`agent.toolbox` 属性を使用できます。
-
-```py
-print("\n".join([f"- {a}" for a in agent.toolbox.keys()]))
-```
-
-```text
- document_qa
- image_captioner
- image_qa
- image_segmenter
- transcriber
- summarizer
- text_classifier
- text_qa
- text_reader
- translator
- image_transformer
- text_downloader
- image_generator
- video_generator
- image_upscaler
-```
-
-注意: `image_upscaler` がエージェントのツールボックスの一部となったことに注目してください。
-
-それでは、新しいツールを試してみましょう！[Transformers Agents Quickstart](./transformers_agents#single-execution-run) で生成した画像を再利用します。
-
-
-```py
-from diffusers.utils import load_image
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png"
-)
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200> 
-
-美しい冬の風景にこの画像を変身させましょう：
-
-```py
-image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_transformer` to transform the image.
-
-
-==Code generated by the agent==
-image = image_transformer(image, prompt="A frozen lake and snowy forest")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_winter.png" width=200> 
-
-新しい画像処理ツールは、非常に強力な画像の変更を行うことができるControlNetに基づいています。
-デフォルトでは、画像処理ツールはサイズが512x512ピクセルの画像を返します。それを拡大できるか見てみましょう。
-
-
-```py
-image = agent.run("Upscale the image", image)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_upscaler` to upscale the image.
-
-
-==Code generated by the agent==
-upscaled_image = image_upscaler(image)
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_winter_upscale.png" width=400> 
-
-
-エージェントは、プロンプト「画像の拡大」を、その説明とツールの名前だけを基に、新たに追加されたアップスケーリングツールに自動的にマッピングし、正しく実行できました。
-
-次に、新しいカスタムツールを作成する方法を見てみましょう。
-
-### Adding new tools
-
-このセクションでは、エージェントに追加できる新しいツールの作成方法を示します。
-
-#### Creating a new tool
-
-まず、ツールの作成から始めましょう。次のコードで、特定のタスクに関してHugging Face Hubで最もダウンロードされたモデルを取得する、あまり役立たないけれども楽しいタスクを追加します。
-
-以下のコードでそれを行うことができます：
-
-
-```python
-from huggingface_hub import list_models
-
-task = "text-classification"
-
-model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-print(model.id)
-```
-
-タスク `text-classification` の場合、これは `'facebook/bart-large-mnli'` を返します。`translation` の場合、`'google-t5/t5-base'` を返します。
-
-これをエージェントが利用できるツールに変換する方法は何でしょうか？すべてのツールは、主要な属性を保持するスーパークラス `Tool` に依存しています。私たちは、それを継承したクラスを作成します:
-
-
-```python
-from transformers import Tool
-
-
-class HFModelDownloadsTool(Tool):
-    pass
-```
-
-このクラスにはいくつかの必要な要素があります：
- `name` 属性：これはツール自体の名前に対応し、他のツールと調和するために `model_download_counter` と名付けます。
- `description` 属性：これはエージェントのプロンプトを埋めるために使用されます。
- `inputs` と `outputs` 属性：これらを定義することで、Python インタープリターが型に関する賢明な選択を行うのに役立ち、ツールをHubにプッシュする際にgradio-demoを生成できるようになります。これらは、予想される値のリストであり、`text`、`image`、または`audio`になることがあります。
- `__call__` メソッド：これには推論コードが含まれています。これは上記で試したコードです！
-
-こちらが現在のクラスの外観です：
-
-
-```python
-from transformers import Tool
-from huggingface_hub import list_models
-
-
-class HFModelDownloadsTool(Tool):
-    name = "model_download_counter"
-    description = (
-        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
-        "It takes the name of the category (such as text-classification, depth-estimation, etc), and "
-        "returns the name of the checkpoint."
-    )
-
-    inputs = ["text"]
-    outputs = ["text"]
-
-    def __call__(self, task: str):
-        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-        return model.id
-```
-
-さて、今度はツールが使えるようになりました。このツールをファイルに保存し、メインスクリプトからインポートしましょう。このファイルを `model_downloads.py` という名前にし、結果のインポートコードは次のようになります：
-
-以下は、現在のクラスの外観です：
-
-
-```python
-from model_downloads import HFModelDownloadsTool
-
-tool = HFModelDownloadsTool()
-```
-
-他の人々に利益をもたらし、より簡単な初期化のために、それをHubにあなたの名前空間でプッシュすることをお勧めします。これを行うには、`tool` 変数で `push_to_hub` を呼び出すだけです：
-
-```python
-tool.push_to_hub("hf-model-downloads")
-```
-
-エージェントがツールを使用する方法について、最終ステップを見てみましょう。
-
-#### Having the agent use the tool
-
-Hubにあるツールがあります。これは次のようにインスタンス化できます（ユーザー名をツールに合わせて変更してください）:
-
-```python
-from transformers import load_tool
-
-tool = load_tool("lysandre/hf-model-downloads")
-```
-
-エージェントで使用するためには、エージェントの初期化メソッドの `additional_tools` パラメータにそれを渡すだけです：
-
-
-```python
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
-
-agent.run(
-    "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
-)
-```
-which outputs the following:
-```text
-==Code generated by the agent==
-model = model_download_counter(task="text-to-video")
-print(f"The model with the most downloads is {model}.")
-audio_model = text_reader(model)
-
-
-==Result==
-The model with the most downloads is damo-vilab/text-to-video-ms-1.7b.
-```
-
-以下のテキストは、次のオーディオを生成します。
-
-
-
-**Audio**                                                                                                                                            |
-|------------------------------------------------------------------------------------------------------------------------------------------------------|
-| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
-
-
-<Tip>
-
-特定のLLMに依存することがあり、うまく機能させるためには非常に正確なプロンプトが必要なものもあります。ツールの名前と説明を明確に定義することは、エージェントによって活用されるために非常に重要です。
-
-</Tip>
-
-### Replacing existing tools
-
-既存のツールを置き換えるには、新しいアイテムをエージェントのツールボックスに割り当てるだけで行うことができます。以下はその方法です:
-
-```python
-from transformers import HfAgent, load_tool
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-agent.toolbox["image-transformation"] = load_tool("diffusers/controlnet-canny-tool")
-```
-
-<Tip>
-
-他のツールでツールを置き換える際には注意が必要です！これにより、エージェントのプロンプトも調整されます。これは、タスクに適したより良いプロンプトを持っている場合には良いことですが、他のツールが選択される確率が高くなり、定義したツールの代わりに他のツールが選択されることもあるかもしれません。
-
-</Tip>
-
-## Leveraging gradio-tools
-
-[gradio-tools](https://github.com/freddyaboulton/gradio-tools)は、Hugging Face Spacesをツールとして使用することを可能にする強力なライブラリです。既存の多くのSpacesおよびカスタムSpacesを設計することもサポートしています。
-
-我々は、`gradio_tools`を使用して`StableDiffusionPromptGeneratorTool`ツールを活用したいと考えています。このツールは`gradio-tools`ツールキットで提供されており、プロンプトを改善し、より良い画像を生成するために使用します。
-
-まず、`gradio_tools`からツールをインポートし、それをインスタンス化します:
-
-```python
-from gradio_tools import StableDiffusionPromptGeneratorTool
-
-gradio_tool = StableDiffusionPromptGeneratorTool()
-```
-
-そのインスタンスを `Tool.from_gradio` メソッドに渡します：
-
-
-```python
-from transformers import Tool
-
-tool = Tool.from_gradio(gradio_tool)
-```
-
-これからは、通常のカスタムツールと同じようにそれを管理できます。私たちはプロンプトを改善するためにそれを活用します。
-` a rabbit wearing a space suit`:
-
-```python
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
-
-agent.run("Generate an image of the `prompt` after improving it.", prompt="A rabbit wearing a space suit")
-```
-
-The model adequately leverages the tool:
-```text
-==Explanation from the agent==
-I will use the following  tools: `StableDiffusionPromptGenerator` to improve the prompt, then `image_generator` to generate an image according to the improved prompt.
-
-
-==Code generated by the agent==
-improved_prompt = StableDiffusionPromptGenerator(prompt)
-print(f"The improved prompt is {improved_prompt}.")
-image = image_generator(improved_prompt)
-```
-
-最終的に画像を生成する前に：
-
-![画像](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png">
-
-<Tip warning={true}>
-
-gradio-toolsは、さまざまなモダリティを使用する場合でも、*テキスト*の入力と出力が必要です。この実装は画像と音声オブジェクトと連携します。現時点では、これら2つは互換性がありませんが、サポートを向上させるために取り組んでおり、迅速に互換性が向上するでしょう。
-
-</Tip>
-
-## Future compatibility with Langchain
-
-私たちはLangchainを愛しており、非常に魅力的なツールのスイートを持っていると考えています。これらのツールを扱うために、Langchainはさまざまなモダリティで作業する場合でも、*テキスト*の入出力が必要です。これは、オブジェクトのシリアル化バージョン（つまり、ディスクに保存されたバージョン）であることが多いです。
-
-この違いにより、transformers-agentsとlangchain間ではマルチモダリティが処理されていません。
-この制限は将来のバージョンで解決されることを目指しており、熱心なlangchainユーザーからの任意の支援を歓迎します。
-
-私たちはより良いサポートを提供したいと考えています。お手伝いいただける場合は、ぜひ[問題を開いて](https://github.com/huggingface/transformers/issues/new)、お考えのことを共有してください。
-
-
--- a/docs/source/ja/generation_strategies.md
+++ b/docs/source/ja/generation_strategies.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 テキスト生成は、オープンエンドのテキスト生成、要約、翻訳など、多くの自然言語処理タスクに不可欠です。また、テキストを出力とするさまざまな混在モダリティアプリケーションにも影響を与えており、例えば音声からテキストへの変換や画像からテキストへの変換などがあります。テキストを生成できるいくつかのモデルには、GPT2、XLNet、OpenAI GPT、CTRL、TransformerXL、XLM、Bart、T5、GIT、Whisperが含まれます。

-[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用して、異なるタスクのテキスト出力を生成するいくつかの例をご紹介します：
+[`~generation.GenerationMixin.generate`] メソッドを使用して、異なるタスクのテキスト出力を生成するいくつかの例をご紹介します：
 * [テキスト要約](./tasks/summarization#inference)
 * [画像のキャプション](./model_doc/git#transformers.GitForCausalLM.forward.example)
 * [音声の転記](./model_doc/whisper#transformers.WhisperForConditionalGeneration.forward.example)
@ -342,4 +342,4 @@ culture, and they allow us to design the'
 >>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
 ['Alice and Bob are going to the same party. It is a small party, in a small']
-```
+```
--- a/docs/source/ja/main_classes/agent.md
+++ b/docs/source/ja/main_classes/agent.md
@ -18,88 +18,9 @@ rendered properly in your Markdown viewer.

 <Tip warning={true}>

-Transformers Agents は実験的な API であり、いつでも変更される可能性があります。エージェントから返される結果
-API または基礎となるモデルは変更される傾向があるため、変更される可能性があります。
+The Agents framework has significantly changed in version v4.41.0.
+This document has been removed as it was referencing an older API.

-</Tip>
+We eagerly welcome new contributions for the updated API.

-エージェントとツールの詳細については、[入門ガイド](../transformers_agents) を必ずお読みください。このページ
-基礎となるクラスの API ドキュメントが含まれています。
-
-## エージェント
-
-私たちは 3 種類のエージェントを提供します。[`HfAgent`] はオープンソース モデルの推論エンドポイントを使用し、[`LocalAgent`] は選択したモデルをローカルで使用し、[`OpenAiAgent`] は OpenAI クローズド モデルを使用します。
-
-### HfAgent
-
-[[autodoc]] HfAgent
-
-### LocalAgent
-
-[[autodoc]] LocalAgent
-
-### OpenAiAgent
-
-[[autodoc]] OpenAiAgent
-
-### AzureOpenAiAgent
-
-[[autodoc]] AzureOpenAiAgent
-
-### Agent
-
-[[autodoc]] Agent
-    - chat
-    - run
-    - prepare_for_new_chat
-
-## Tools
-
-### load_tool
-
-[[autodoc]] load_tool
-
-### Tool
-
-[[autodoc]] Tool
-
-### PipelineTool
-
-[[autodoc]] PipelineTool
-
-### RemoteTool
-
-[[autodoc]] RemoteTool
-
-### launch_gradio_demo
-
-[[autodoc]] launch_gradio_demo
-
-## エージェントの種類
-
-エージェントはツール間であらゆる種類のオブジェクトを処理できます。ツールは完全にマルチモーダルであるため、受け取りと返品が可能です
-テキスト、画像、オーディオ、ビデオなどのタイプ。ツール間の互換性を高めるためだけでなく、
-これらの戻り値を ipython (jupyter、colab、ipython ノートブックなど) で正しくレンダリングするには、ラッパー クラスを実装します。
-このタイプの周り。
-
-ラップされたオブジェクトは最初と同じように動作し続けるはずです。テキストオブジェクトは依然として文字列または画像として動作する必要があります
-オブジェクトは依然として `PIL.Image` として動作するはずです。
-
-これらのタイプには、次の 3 つの特定の目的があります。
-
- 型に対して `to_raw` を呼び出すと、基になるオブジェクトが返されるはずです
- 型に対して `to_string` を呼び出すと、オブジェクトを文字列として返す必要があります。`AgentText` の場合は文字列になる可能性があります。
-  ただし、他のインスタンスのオブジェクトのシリアル化されたバージョンのパスになります。
- ipython カーネルで表示すると、オブジェクトが正しく表示されるはずです
-
-### AgentText
-
-[[autodoc]] transformers.tools.agent_types.AgentText
-
-### AgentImage
-
-[[autodoc]] transformers.tools.agent_types.AgentImage
-
-### AgentAudio
-
-[[autodoc]] transformers.tools.agent_types.AgentAudio
+</Tip>
--- a/docs/source/ja/model_doc/code_llama.md
+++ b/docs/source/ja/model_doc/code_llama.md
@ -23,7 +23,7 @@ Code Llama モデルはによって [Code Llama: Open Foundation Models for Code

 *私たちは Code Llama をリリースします。これは Llama 2 に基づくコードの大規模言語モデル ファミリであり、オープン モデルの中で最先端のパフォーマンス、埋め込み機能、大規模な入力コンテキストのサポート、プログラミング タスクのゼロショット命令追従機能を提供します。 。幅広いアプリケーションをカバーするための複数のフレーバーを提供しています。基盤モデル (Code Llama)、Python 特化 (Code Llama - Python)、およびそれぞれ 7B、13B、および 34B パラメーターを備えた命令追従モデル (Code Llama - Instruct) です。すべてのモデルは 16,000 トークンのシーケンスでトレーニングされ、最大 100,000 トークンの入力で改善が見られます。 7B および 13B コード ラマとコード ラマ - 命令バリアントは、周囲のコンテンツに基づいた埋め込みをサポートします。 Code Llama は、いくつかのコード ベンチマークでオープン モデルの中で最先端のパフォーマンスに達し、HumanEval と MBPP でそれぞれ最大 53% と 55% のスコアを獲得しました。特に、Code Llama - Python 7B は HumanEval および MBPP 上で Llama 2 70B よりも優れたパフォーマンスを示し、すべてのモデルは MultiPL-E 上で公開されている他のすべてのモデルよりも優れています。私たちは、研究と商業利用の両方を許可する寛容なライセンスに基づいて Code Llama をリリースしています。*

-すべての Code Llama モデル チェックポイントを [こちら](https://huggingface.co/models?search=code_llama) で確認し、[codellama org](https://huggingface.co/codellama) で正式にリリースされたチェックポイントを確認してください。
+すべての Code Llama モデル チェックポイントを [こちら](https://huggingface.co/models?search=code_llama) で確認し、[meta llama org](https://huggingface.co/meta-llama) で正式にリリースされたチェックポイントを確認してください。

 このモデルは [ArthurZucker](https://huggingface.co/ArthurZ) によって提供されました。著者のオリジナルのコードは [こちら](https://github.com/facebookresearch/llama) にあります。

@ -60,8 +60,8 @@ python src/transformers/models/llama/convert_llama_weights_to_hf.py \
 ```python
 >>> from transformers import LlamaForCausalLM, CodeLlamaTokenizer

->>> tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
->>> model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf")
+>>> tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
+>>> model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
 >>> PROMPT = '''def remove_non_ascii(s: str) -> str:
    """ <FILL_ME>
    return result
@ -93,7 +93,7 @@ def remove_non_ascii(s: str) -> str:
 >>> from transformers import pipeline
 >>> import torch

->>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
+>>> generator = pipeline("text-generation",model="meta-llama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
 >>> generator('def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return result', max_new_tokens = 128)
 [{'generated_text': 'def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return resultRemove non-ASCII characters from a string. """\n    result = ""\n    for c in s:\n        if ord(c) < 128:\n            result += c'}]
 ```
--- a/docs/source/ja/tasks/language_modeling.md
+++ b/docs/source/ja/tasks/language_modeling.md
@ -388,7 +388,7 @@ TensorFlow でモデルを微調整するには、オプティマイザー関数
 >>> inputs = tokenizer(prompt, return_tensors="pt").input_ids
 ```

-[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用してテキストを生成します。
+[`~generation.GenerationMixin.generate`] メソッドを使用してテキストを生成します。
 さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[テキスト生成戦略](../generation_strategies) ページを参照してください。

 ```py
--- a/docs/source/ja/tasks/summarization.md
+++ b/docs/source/ja/tasks/summarization.md
@ -358,7 +358,7 @@ Tokenize the text and return the `input_ids` as PyTorch tensors:
 >>> inputs = tokenizer(text, return_tensors="pt").input_ids
 ```

-[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用して要約を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。
+[`~generation.GenerationMixin.generate`] メソッドを使用して要約を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。

 ```py
 >>> from transformers import AutoModelForSeq2SeqLM
--- a/docs/source/ja/tasks/translation.md
+++ b/docs/source/ja/tasks/translation.md
@ -366,7 +366,7 @@ TensorFlow でモデルを微調整するには、オプティマイザー関数
 >>> inputs = tokenizer(text, return_tensors="pt").input_ids
 ```

-[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用して翻訳を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。
+[`~generation.GenerationMixin.generate`] メソッドを使用して翻訳を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。


 ```py
--- a/docs/source/ko/custom_tools.md
+++ b/docs/source/ko/custom_tools.md
@ -12,737 +12,11 @@ specific language governing permissions and limitations under the License.

 # 사용자 정의 도구와 프롬프트[[custom-tools-and-prompts]]

-<Tip>
-
-Transformers와 관련하여 어떤 도구와 에이전트가 있는지 잘 모르신다면 [Transformers Agents](transformers_agents) 페이지를 먼저 읽어보시기 바랍니다. 
-
-</Tip>
-
 <Tip warning={true}>

-Transformers Agents는 실험 중인 API로 언제든지 변경될 수 있습니다. 
-API 또는 기반 모델이 변경되기 쉽기 때문에 에이전트가 반환하는 결과도 달라질 수 있습니다.
+The Agents framework has significantly changed in version v4.41.0.
+This document has been removed as it was referencing an older API.
+
+We eagerly welcome new contributions for the updated API.

 </Tip>
-
-에이전트에게 권한을 부여하고 새로운 작업을 수행하게 하려면 사용자 정의 도구와 프롬프트를 만들고 사용하는 것이 무엇보다 중요합니다.
-이 가이드에서는 다음과 같은 내용을 살펴보겠습니다:
-
- 프롬프트를 사용자 정의하는 방법
- 사용자 정의 도구를 사용하는 방법
- 사용자 정의 도구를 만드는 방법
-
-## 프롬프트를 사용자 정의하기[[customizing-the-prompt]]
-
-[Transformers Agents](transformers_agents)에서 설명한 것처럼 에이전트는 [`~Agent.run`] 및 [`~Agent.chat`] 모드에서 실행할 수 있습니다.
-`run`(실행) 모드와 `chat`(채팅) 모드 모두 동일한 로직을 기반으로 합니다. 
-에이전트를 구동하는 언어 모델은 긴 프롬프트에 따라 조건이 지정되고, 중지 토큰에 도달할 때까지 다음 토큰을 생성하여 프롬프트를 완수합니다.
-`chat` 모드에서는 프롬프트가 이전 사용자 입력 및 모델 생성으로 연장된다는 점이 두 모드의 유일한 차이점입니다.
-이를 통해 에이전트가 과거 상호작용에 접근할 수 있게 되므로 에이전트에게 일종의 메모리를 제공하는 셈입니다.
-
-### 프롬프트의 구조[[structure-of-the-prompt]]
-
-어떻게 프롬프트 사용자 정의를 잘 할 수 있는지 이해하기 위해 프롬프트의 구조를 자세히 살펴봅시다.
-프롬프트는 크게 네 부분으로 구성되어 있습니다.
-
- 1. 도입: 에이전트가 어떻게 행동해야 하는지, 도구의 개념에 대한 설명.
- 2. 모든 도구에 대한 설명. 이는 런타임에 사용자가 정의/선택한 도구로 동적으로 대체되는 `<<all_tools>>` 토큰으로 정의됩니다.
- 3. 작업 예제 및 해당 솔루션 세트.
- 4. 현재 예제 및 해결 요청.
-
-각 부분을 더 잘 이해할 수 있도록 짧은 버전을 통해 `run` 프롬프트가 어떻게 보이는지 살펴보겠습니다:
-
-````text
-I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task.
-[...]
-You can print intermediate results if it makes sense to do so.
-
-Tools:
- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question.
- image_captioner: This is a tool that generates a description of an image. It takes an input named `image` which should be the image to the caption and returns a text that contains the description in English.
-[...]
-
-Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French."
-
-I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
-
-Answer:
-```py
-translated_question = translator(question=question, src_lang="French", tgt_lang="English")
-print(f"The translated question is {translated_question}.")
-answer = image_qa(image=image, question=translated_question)
-print(f"The answer is {answer}")
-```
-
-Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner."
-
-I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
-
-Answer:
-```py
-answer = document_qa(document, question="What is the oldest person?")
-print(f"The answer is {answer}.")
-image = image_generator("A banner showing " + answer)
-```
-
-[...]
-
-Task: "Draw me a picture of rivers and lakes"
-
-I will use the following
-````
-
-도입(*"도구:"* 앞의 텍스트)에서는 모델이 어떻게 작동하고 무엇을 해야 하는지 정확하게 설명합니다.
-에이전트는 항상 같은 방식으로 작동해야 하므로 이 부분은 사용자 정의할 필요가 없을 가능성이 높습니다.
-
-두 번째 부분(*"도구"* 아래의 글머리 기호)은 `run` 또는 `chat`을 호출할 때 동적으로 추가됩니다. 
-정확히 `agent.toolbox`에 있는 도구 수만큼 글머리 기호가 있고, 각 글머리 기호는 도구의 이름과 설명으로 구성됩니다:
-
-```text
- <tool.name>: <tool.description>
-```
-
-문서 질의응답 도구를 가져오고 이름과 설명을 출력해서 빠르게 확인해 보겠습니다.
-
-```py
-from transformers import load_tool
-
-document_qa = load_tool("document-question-answering")
-print(f"- {document_qa.name}: {document_qa.description}")
-```
-
-그러면 다음 결과가 출력됩니다:
-```text
- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question.
-```
-
-여기서 도구 이름이 짧고 정확하다는 것을 알 수 있습니다. 
-설명은 두 부분으로 구성되어 있는데, 첫 번째 부분에서는 도구의 기능을 설명하고 두 번째 부분에서는 예상되는 입력 인수와 반환 값을 명시합니다.
-
-에이전트가 도구를 올바르게 사용하려면 좋은 도구 이름과 도구 설명이 매우 중요합니다. 
-에이전트가 도구에 대해 알 수 있는 유일한 정보는 이름과 설명뿐이므로, 이 두 가지를 정확하게 작성하고 도구 상자에 있는 기존 도구의 스타일과 일치하는지 확인해야 합니다. 
-특히 이름에 따라 예상되는 모든 인수가 설명에 코드 스타일로 언급되어 있는지, 예상되는 유형과 그 유형이 무엇인지에 대한 설명이 포함되어 있는지 확인하세요.
-
-<Tip>
-
-도구에 어떤 이름과 설명이 있어야 하는지 이해하려면 엄선된 Transformers 도구의 이름과 설명을 확인하세요. 
-[`Agent.toolbox`] 속성을 가진 모든 도구를 볼 수 있습니다.
-
-</Tip>
-
-세 번째 부분에는 에이전트가 어떤 종류의 사용자 요청에 대해 어떤 코드를 생성해야 하는지 정확하게 보여주는 엄선된 예제 세트가 포함되어 있습니다. 
-에이전트를 지원하는 대규모 언어 모델은 프롬프트에서 패턴을 인식하고 새로운 데이터로 패턴을 반복하는 데 매우 능숙합니다. 
-따라서 에이전트가 실제로 올바른 실행 가능한 코드를 생성할 가능성을 극대화하는 방식으로 예제를 작성하는 것이 매우 중요합니다. 
-
-한 가지 예를 살펴보겠습니다:
-
-````text
-Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner."
-
-I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
-
-Answer:
-```py
-answer = document_qa(document, question="What is the oldest person?")
-print(f"The answer is {answer}.")
-image = image_generator("A banner showing " + answer)
-```
-
-````
-작업 설명, 에이전트가 수행하려는 작업에 대한 설명, 마지막으로 생성된 코드, 이 세 부분으로 구성된 프롬프트는 모델에 반복하여 제공됩니다. 
-프롬프트의 일부인 모든 예제는 이러한 정확한 패턴으로 되어 있으므로, 에이전트가 새 토큰을 생성할 때 정확히 동일한 패턴을 재현할 수 있습니다.
-
-프롬프트 예제는 Transformers 팀이 선별하고 일련의 [problem statements](https://github.com/huggingface/transformers/blob/main/src/transformers/tools/evaluate_agent.py)에 따라 엄격하게 평가하여 
-에이전트의 프롬프트가 에이전트의 실제 사용 사례를 최대한 잘 해결할 수 있도록 보장합니다.
-
-프롬프트의 마지막 부분은 다음에 해당합니다:
-```text
-Task: "Draw me a picture of rivers and lakes"
-
-I will use the following
-```
-
-이는 에이전트가 완료해야 할 최종적인 미완성 예제입니다. 미완성 예제는 실제 사용자 입력에 따라 동적으로 만들어집니다. 
-위 예시의 경우 사용자가 다음과 같이 실행했습니다:
-
-```py
-agent.run("Draw me a picture of rivers and lakes")
-```
-
-사용자 입력 - *즉* Task: *"Draw me a picture of rivers and lakes"*가 프롬프트 템플릿에 맞춰 "Task: <task> \n\n I will use the following"로 캐스팅됩니다. 
-이 문장은 에이전트에게 조건이 적용되는 프롬프트의 마지막 줄을 구성하므로 에이전트가 이전 예제에서 수행한 것과 정확히 동일한 방식으로 예제를 완료하도록 강력하게 영향을 미칩니다.
-
-너무 자세히 설명하지 않더라도 채팅 템플릿의 프롬프트 구조는 동일하지만 예제의 스타일이 약간 다릅니다. *예를 들면*:
-
-````text
-[...]
-
-=====
-
-Human: Answer the question in the variable `question` about the image stored in the variable `image`.
-
-Assistant: I will use the tool `image_qa` to answer the question on the input image.
-
-```py
-answer = image_qa(text=question, image=image)
-print(f"The answer is {answer}")
-```
-
-Human: I tried this code, it worked but didn't give me a good result. The question is in French
-
-Assistant: In this case, the question needs to be translated first. I will use the tool `translator` to do this.
-
-```py
-translated_question = translator(question=question, src_lang="French", tgt_lang="English")
-print(f"The translated question is {translated_question}.")
-answer = image_qa(text=translated_question, image=image)
-print(f"The answer is {answer}")
-```
-
-=====
-
-[...]
-````
-
-`run` 프롬프트의 예와는 반대로, 각 `chat` 프롬프트의 예에는 *Human(사람)*과 *Assistant(어시스턴트)* 간에 하나 이상의 교환이 있습니다. 모든 교환은 `run` 프롬프트의 예와 유사한 구조로 되어 있습니다.
-사용자의 입력이 *Human:* 뒤에 추가되며, 에이전트에게 코드를 생성하기 전에 수행해야 할 작업을 먼저 생성하라는 메시지가 표시됩니다. 
-교환은 이전 교환을 기반으로 할 수 있으므로 위와 같이 사용자가 "**이** 코드를 시도했습니다"라고 입력하면 이전에 생성된 에이전트의 코드를 참조하여 과거 교환을 참조할 수 있습니다.
-
-`.chat`을 실행하면 사용자의 입력 또는 *작업*이 미완성된 양식의 예시로 캐스팅됩니다:
-```text
-Human: <user-input>\n\nAssistant:
-```
-그러면 에이전트가 이를 완성합니다. `run` 명령과 달리 `chat` 명령은 완료된 예제를 프롬프트에 추가하여 에이전트에게 다음 `chat` 차례에 대한 더 많은 문맥을 제공합니다.
-
-이제 프롬프트가 어떻게 구성되어 있는지 알았으니 어떻게 사용자 정의할 수 있는지 살펴봅시다!
-
-### 좋은 사용자 입력 작성하기[[writing-good-user-inputs]]
-
-대규모 언어 모델이 사용자의 의도를 이해하는 능력이 점점 더 향상되고 있지만, 에이전트가 올바른 작업을 선택할 수 있도록 최대한 정확성을 유지하는 것은 큰 도움이 됩니다. 
-최대한 정확하다는 것은 무엇을 의미할까요?
-
-에이전트는 프롬프트에서 도구 이름 목록과 해당 설명을 볼 수 있습니다. 
-더 많은 도구가 추가될수록 에이전트가 올바른 도구를 선택하기가 더 어려워지고 실행할 도구의 올바른 순서를 선택하는 것은 더욱 어려워집니다. 
-일반적인 실패 사례를 살펴보겠습니다. 여기서는 분석할 코드만 반환하겠습니다.
-
-```py
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-
-agent.run("Show me a tree", return_code=True)
-```
-
-그러면 다음 결과가 출력됩니다:
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_segmenter` to create a segmentation mask for the image.
-
-
-==Code generated by the agent==
-mask = image_segmenter(image, prompt="tree")
-```
-
-우리가 원했던 결과가 아닐 수도 있습니다. 대신 나무 이미지가 생성되기를 원할 가능성이 더 높습니다.
-따라서 에이전트가 특정 도구를 사용하도록 유도하려면 도구의 이름과 설명에 있는 중요한 키워드를 사용하는 것이 매우 유용할 수 있습니다. 한번 살펴보겠습니다.
-```py
-agent.toolbox["image_generator"].description
-```
-
-```text
-'This is a tool that creates an image according to a prompt, which is a text description. It takes an input named `prompt` which contains the image description and outputs an image.
-```
-
-이름과 설명은 "image", "prompt", "create" 및 "generate" 키워드를 사용합니다. 이 단어들을 사용하면 더 잘 작동할 가능성이 높습니다. 프롬프트를 조금 더 구체화해 보겠습니다.
-
-```py
-agent.run("Create an image of a tree", return_code=True)
-```
-
-이 코드는 다음 프롬프트를 만들어냅니다:
-```text
-==Explanation from the agent==
-I will use the following tool `image_generator` to generate an image of a tree.
-
-
-==Code generated by the agent==
-image = image_generator(prompt="tree")
-```
-
-훨씬 낫네요! 저희가 원했던 것과 비슷해 보입니다. 
-즉, 에이전트가 작업을 올바른 도구에 올바르게 매핑하는 데 어려움을 겪고 있다면 도구 이름과 설명에서 가장 관련성이 높은 키워드를 찾아보고 이를 통해 작업 요청을 구체화해 보세요.
-
-### 도구 설명 사용자 정의하기[[customizing-the-tool-descriptions]]
-
-앞서 살펴본 것처럼 에이전트는 각 도구의 이름과 설명에 액세스할 수 있습니다. 
-기본 도구에는 매우 정확한 이름과 설명이 있어야 하지만 특정 사용 사례에 맞게 도구의 설명이나 이름을 변경하는 것이 도움이 될 수도 있습니다. 
-이는 매우 유사한 여러 도구를 추가했거나 특정 도메인(*예*: 이미지 생성 및 변환)에만 에이전트를 사용하려는 경우에 특히 중요해질 수 있습니다.
-
-일반적인 문제는 이미지 생성 작업에 많이 사용되는 경우 에이전트가 이미지 생성과 이미지 변환/수정을 혼동하는 것입니다. *예를 들어,*
-```py
-agent.run("Make an image of a house and a car", return_code=True)
-```
-그러면 다음 결과가 출력됩니다:
-```text
-==Explanation from the agent== 
-I will use the following tools `image_generator` to generate an image of a house and `image_transformer` to transform the image of a car into the image of a house.
-
-==Code generated by the agent==
-house_image = image_generator(prompt="A house")
-car_image = image_generator(prompt="A car")
-house_car_image = image_transformer(image=car_image, prompt="A house")
-```
-
-결과물이 우리가 여기서 원하는 것과 정확히 일치하지 않을 수 있습니다. 에이전트가 `image_generator`와 `image_transformer`의 차이점을 이해하기 어려워서 두 가지를 함께 사용하는 경우가 많은 것 같습니다.
-
-여기서 `image_transformer`의 도구 이름과 설명을 변경하여 에이전트가 도울 수 있습니다. 
-"image" 및 "prompt"와 약간 분리하기 위해 `modifier`라고 대신 부르겠습니다:
-```py
-agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer")
-agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace(
-    "transforms an image according to a prompt", "modifies an image"
-)
-```
-
-이제 "modify"은 새 이미지 프로세서를 사용하라는 강력한 신호이므로 위의 프롬프트에 도움이 될 것입니다. 다시 실행해 봅시다.
-
-```py
-agent.run("Make an image of a house and a car", return_code=True)
-```
-
-여기서 다음과 같은 결과를 얻게 됩니다:
-```text
-==Explanation from the agent==
-I will use the following tools: `image_generator` to generate an image of a house, then `image_generator` to generate an image of a car.
-
-
-==Code generated by the agent==
-house_image = image_generator(prompt="A house")
-car_image = image_generator(prompt="A car")
-```
-
-우리가 염두에 두었던 것과 확실히 더 가까워졌습니다! 하지만 집과 자동차가 모두 같은 이미지에 포함되면 좋겠습니다. 작업을 단일 이미지 생성에 더 집중하면 도움이 될 것입니다:
-
-```py
-agent.run("Create image: 'A house and car'", return_code=True)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_generator` to generate an image.
-
-
-==Code generated by the agent==
-image = image_generator(prompt="A house and car")
-```
-
-<Tip warning={true}>
-
-에이전트는 여전히 특히 여러 개체의 이미지를 생성하는 것과 같이 약간 더 복잡한 사용 사례에서 취약한 경우가 많습니다.
-앞으로 몇 달 안에 에이전트 자체와 기본 프롬프트가 더욱 개선되어 에이전트가 다양한 사용자 입력에 더욱 강력하게 대응할 수 있도록 할 예정입니다.
-
-</Tip>
-
-### 전체 프롬프트 사용자 정의하기[[customizing-the-whole-prompt]]
-
-사용자에게 최대한의 유연성을 제공하기 위해 [위](#structure-of-the-prompt)에 설명된 전체 프롬프트 템플릿을 사용자가 덮어쓸 수 있습니다. 
-이 경우 사용자 정의 프롬프트에 소개 섹션, 도구 섹션, 예제 섹션 및 미완성 예제 섹션이 포함되어 있는지 확인하세요. 
-`run` 프롬프트 템플릿을 덮어쓰려면 다음과 같이 하면 됩니다:
-
-```py
-template = """ [...] """
-
-agent = HfAgent(your_endpoint, run_prompt_template=template)
-```
-
-<Tip warning={true}>
-
-에이전트가 사용 가능한 도구를 인식하고 사용자의 프롬프트를 올바르게 삽입할 수 있도록 `<<all_tools>>` 문자열과 `<<prompt>>`를 `template` 어딘가에 정의해야 합니다.
-
-</Tip>
-
-마찬가지로 `chat` 프롬프트 템플릿을 덮어쓸 수 있습니다. `chat` 모드에서는 항상 다음과 같은 교환 형식을 사용한다는 점에 유의하세요:
-
-```text
-Human: <<task>>
-
-Assistant:
-```
-
-따라서 사용자 정의 `chat` 프롬프트 템플릿의 예제에서도 이 형식을 사용하는 것이 중요합니다. 
-다음과 같이 인스턴스화 할 때 `chat` 템플릿을 덮어쓸 수 있습니다.
-
-```python
-template = """ [...] """
-
-agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template)
-```
-
-<Tip warning={true}>
-
-에이전트가 사용 가능한 도구를 인식할 수 있도록 `<<all_tools>>` 문자열을 `template` 어딘가에 정의해야 합니다.
-
-</Tip>
-
-두 경우 모두 커뮤니티의 누군가가 호스팅하는 템플릿을 사용하려는 경우 프롬프트 템플릿 대신 저장소 ID를 전달할 수 있습니다. 
-기본 프롬프트는 [이 저장소](https://huggingface.co/datasets/huggingface-tools/default-prompts)를 예로 들 수 있습니다.
-
-Hub의 저장소에 사용자 정의 프롬프트를 업로드하여 커뮤니티와 공유하려면 다음을 확인하세요:
- 데이터 세트 저장소를 사용하세요.
- `run` 명령에 대한 프롬프트 템플릿을 `run_prompt_template.txt`라는 파일에 넣으세요.
- `chat` 명령에 대한 프롬프트 템플릿을 `chat_prompt_template.txt`라는 파일에 넣으세요.
-
-## 사용자 정의 도구 사용하기[[using-custom-tools]]
-
-이 섹션에서는 이미지 생성에 특화된 두 가지 기존 사용자 정의 도구를 활용하겠습니다:
-
- 더 많은 이미지 수정을 허용하기 위해 [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation)을 
-  [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool)로 대체합니다.
- 기본 도구 상자에 이미지 업스케일링을 위한 새로운 도구가 추가되었습니다: 
-  [diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool)가 기존 이미지 변환 도구를 대체합니다.
-
-편리한 [`load_tool`] 함수를 사용하여 사용자 정의 도구를 가져오는 것으로 시작하겠습니다:
-
-```py
-from transformers import load_tool
-
-controlnet_transformer = load_tool("diffusers/controlnet-canny-tool")
-upscaler = load_tool("diffusers/latent-upscaler-tool")
-```
-
-에이전트에게 사용자 정의 도구를 추가하면 도구의 설명과 이름이 에이전트의 프롬프트에 자동으로 포함됩니다. 
-따라서 에이전트가 사용 방법을 이해할 수 있도록 사용자 정의 도구의 설명과 이름을 잘 작성해야 합니다.
-`controlnet_transformer`의 설명과 이름을 살펴보겠습니다:
-
-```py
-print(f"Description: '{controlnet_transformer.description}'")
-print(f"Name: '{controlnet_transformer.name}'")
-```
-
-그러면 다음 결과가 출력됩니다:
-```text
-Description: 'This is a tool that transforms an image with ControlNet according to a prompt. 
-It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.'
-Name: 'image_transformer'
-```
-
-이름과 설명이 정확하고 [큐레이팅 된 도구 세트(curated set of tools)](./transformers_agents#a-curated-set-of-tools)의 스타일에 맞습니다.
-다음으로, `controlnet_transformer`와 `upscaler`로 에이전트를 인스턴스화해 봅시다:
-```py
-tools = [controlnet_transformer, upscaler]
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools)
-```
-
-이 명령을 실행하면 다음 정보가 표시됩니다:
-
-```text
-image_transformer has been replaced by <transformers_modules.diffusers.controlnet-canny-tool.bd76182c7777eba9612fc03c0
-8718a60c0aa6312.image_transformation.ControlNetTransformationTool object at 0x7f1d3bfa3a00> as provided in `additional_tools`
-```
-
-큐레이팅된 도구 세트에는 이미 'image_transformer' 도구가 있으며, 이 도구는 사용자 정의 도구로 대체됩니다.
-
-<Tip>
-
-기존 도구와 똑같은 작업에 사용자 정의 도구를 사용하려는 경우 기존 도구를 덮어쓰는 것이 유용할 수 있습니다. 
-에이전트가 해당 작업에 능숙하기 때문입니다.
-이 경우 사용자 정의 도구가 덮어쓴 도구와 정확히 동일한 API를 따라야 하며, 그렇지 않으면 해당 도구를 사용하는 모든 예제가 업데이트되도록 프롬프트 템플릿을 조정해야 한다는 점에 유의하세요.
-
-</Tip>
-
-업스케일러 도구에 지정된 'image_upscaler'라는 이름 아직 기본 도구 상자에는 존재하지 않기 때문에, 도구 목록에 해당 이름이 간단히 추가되었습니다.
-에이전트가 현재 사용할 수 있는 도구 상자는 언제든지 `agent.toolbox` 속성을 통해 확인할 수 있습니다:
-
-```py
-print("\n".join([f"- {a}" for a in agent.toolbox.keys()]))
-```
-
-```text
- document_qa
- image_captioner
- image_qa
- image_segmenter
- transcriber
- summarizer
- text_classifier
- text_qa
- text_reader
- translator
- image_transformer
- text_downloader
- image_generator
- video_generator
- image_upscaler
-```
-
-에이전트의 도구 상자에 `image_upscaler`가 추가된 점을 주목하세요.
-
-이제 새로운 도구를 사용해봅시다! [Transformers Agents Quickstart](./transformers_agents#single-execution-run)에서 생성한 이미지를 다시 사용하겠습니다.
-
-```py
-from diffusers.utils import load_image
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png"
-)
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200> 
-
-이미지를 아름다운 겨울 풍경으로 바꿔 봅시다:
-
-```py
-image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_transformer` to transform the image.
-
-
-==Code generated by the agent==
-image = image_transformer(image, prompt="A frozen lake and snowy forest")
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_winter.png" width=200> 
-
-새로운 이미지 처리 도구는 이미지를 매우 강력하게 수정할 수 있는 ControlNet을 기반으로 합니다.
-기본적으로 이미지 처리 도구는 512x512 픽셀 크기의 이미지를 반환합니다. 이를 업스케일링할 수 있는지 살펴봅시다.
-
-```py
-image = agent.run("Upscale the image", image)
-```
-
-```text
-==Explanation from the agent==
-I will use the following tool: `image_upscaler` to upscale the image.
-
-
-==Code generated by the agent==
-upscaled_image = image_upscaler(image)
-```
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_winter_upscale.png" width=400> 
-
-에이전트는 업스케일러 도구의 설명과 이름만 보고 방금 추가한 업스케일러 도구에 "이미지 업스케일링"이라는 프롬프트를 자동으로 매핑하여 올바르게 실행했습니다.
-
-다음으로 새 사용자 정의 도구를 만드는 방법을 살펴보겠습니다.
-
-### 새 도구 추가하기[[adding-new-tools]]
-
-이 섹션에서는 에이전트에게 추가할 수 있는 새 도구를 만드는 방법을 보여 드립니다.
-
-#### 새 도구 만들기[[creating-a-new-tool]]
-
-먼저 도구를 만드는 것부터 시작하겠습니다. 
-특정 작업에 대해 가장 많은 다운로드를 받은 Hugging Face Hub의 모델을 가져오는, 그다지 유용하지는 않지만 재미있는 작업을 추가하겠습니다.
-
-다음 코드를 사용하면 됩니다:
-
-```python
-from huggingface_hub import list_models
-
-task = "text-classification"
-
-model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-print(model.id)
-```
-`text-classification`(텍스트 분류) 작업의 경우 `'facebook/bart-large-mnli'`를 반환하고, `translation`(번역) 작업의 경우 `'google-t5/t5-base'`를 반환합니다.
-
-이를 에이전트가 활용할 수 있는 도구로 변환하려면 어떻게 해야 할까요? 
-모든 도구는 필요한 주요 속성을 보유하는 슈퍼클래스 `Tool`에 의존합니다. 이를 상속하는 클래스를 만들어 보겠습니다:
-
-```python
-from transformers import Tool
-
-
-class HFModelDownloadsTool(Tool):
-    pass
-```
-
-이 클래스에는 몇 가지 요구사항이 있습니다:
- 도구 자체의 이름에 해당하는 `name` 속성. 수행명이 있는 다른 도구와 호환되도록 `model_download_counter`로 이름을 지정하겠습니다.
- 에이전트의 프롬프트를 채우는 데 사용되는 속성 `description`.
- `inputs` 및 `outputs` 속성. 이를 정의하면 Python 인터프리터가 유형에 대한 정보에 입각한 선택을 하는 데 도움이 되며, 
-  도구를 허브에 푸시할 때 gradio 데모를 생성할 수 있습니다. 
-  두 속성 모두 값은 '텍스트', '이미지' 또는 '오디오'가 될 수 있는 예상 값의 리스트입니다.
- 추론 코드가 포함된 `__call__` 메소드. 이것이 우리가 위에서 다루었던 코드입니다!
-
-이제 클래스의 모습은 다음과 같습니다:
-
-```python
-from transformers import Tool
-from huggingface_hub import list_models
-
-
-class HFModelDownloadsTool(Tool):
-    name = "model_download_counter"
-    description = (
-        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
-        "It takes the name of the category (such as text-classification, depth-estimation, etc), and "
-        "returns the name of the checkpoint."
-    )
-
-    inputs = ["text"]
-    outputs = ["text"]
-
-    def __call__(self, task: str):
-        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-        return model.id
-```
-
-이제 도구를 손쉽게 사용할 수 있게 되었습니다. 
-도구를 파일에 저장하고 메인 스크립트에서 가져옵니다. 이 파일의 이름을 `model_downloads.py`로 지정하면 결과적으로 가져오기 코드는 다음과 같습니다:
-
-```python
-from model_downloads import HFModelDownloadsTool
-
-tool = HFModelDownloadsTool()
-```
-
-다른 사람들이 이 기능을 활용할 수 있도록 하고 초기화를 더 간단하게 하려면 네임스페이스 아래의 Hub로 푸시하는 것이 좋습니다. 
-그렇게 하려면 `tool` 변수에서 `push_to_hub`를 호출하면 됩니다:
-
-```python
-tool.push_to_hub("hf-model-downloads")
-```
-
-이제 허브에 코드가 생겼습니다! 마지막 단계인 에이전트가 코드를 사용하도록 하는 단계를 살펴보겠습니다.
-
-#### 에이전트가 도구를 사용하게 하기[[Having-the-agent-use-the-tool]]
-
-이제 이런 식으로 허브에 존재하는 도구를 인스턴스화할 수 있습니다(도구의 사용자 이름은 변경하세요):
-We now have our tool that lives on the Hub which can be instantiated as such (change the user name for your tool):
-
-```python
-from transformers import load_tool
-
-tool = load_tool("lysandre/hf-model-downloads")
-```
-
-이 도구를 에이전트에서 사용하려면 에이전트 초기화 메소드의 `additional_tools` 매개변수에 전달하기만 하면 됩니다:
-
-```python
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
-
-agent.run(
-    "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
-)
-```
-그러면 다음과 같은 결과가 출력됩니다:
-```text
-==Code generated by the agent==
-model = model_download_counter(task="text-to-video")
-print(f"The model with the most downloads is {model}.")
-audio_model = text_reader(model)
-
-
-==Result==
-The model with the most downloads is damo-vilab/text-to-video-ms-1.7b.
-```
-
-and generates the following audio.
-
-| **Audio**                                                                                                                                            |
-|------------------------------------------------------------------------------------------------------------------------------------------------------|
-| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
-
-
-<Tip>
-
-LLM에 따라 일부는 매우 취약하기 때문에 제대로 작동하려면 매우 정확한 프롬프트가 필요합니다. 
-에이전트가 도구를 잘 활용하기 위해서는 도구의 이름과 설명을 잘 정의하는 것이 무엇보다 중요합니다.
-
-</Tip>
-
-### 기존 도구 대체하기[[replacing-existing-tools]]
-
-에이전트의 도구 상자에 새 항목을 배정하기만 하면 기존 도구를 대체할 수 있습니다. 방법은 다음과 같습니다:
-
-```python
-from transformers import HfAgent, load_tool
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
-agent.toolbox["image-transformation"] = load_tool("diffusers/controlnet-canny-tool")
-```
-
-<Tip>
-
-다른 도구로 교체할 때는 주의하세요! 이 작업으로 에이전트의 프롬프트도 조정됩니다. 
-작업에 더 적합한 프롬프트가 있으면 좋을 수 있지만, 
-다른 도구보다 더 많이 선택되거나 정의한 도구 대신 다른 도구가 선택될 수도 있습니다.
-
-</Tip>
-
-## gradio-tools 사용하기[[leveraging-gradio-tools]]
-
-[gradio-tools](https://github.com/freddyaboulton/gradio-tools)는 Hugging Face Spaces를 도구로 사용할 수 있는 강력한 라이브러리입니다. 
-기존의 많은 Spaces뿐만 아니라 사용자 정의 Spaces를 사용하여 디자인할 수 있도록 지원합니다.
-
-우리는 `Tool.from_gradio` 메소드를 사용하여 `gradio_tools`에 대한 지원을 제공합니다. 
-예를 들어, 프롬프트를 개선하고 더 나은 이미지를 생성하기 위해 `gradio-tools` 툴킷에서 제공되는 `StableDiffusionPromptGeneratorTool` 도구를 활용하고자 합니다.
-
-먼저 `gradio_tools`에서 도구를 가져와서 인스턴스화합니다:
-
-```python
-from gradio_tools import StableDiffusionPromptGeneratorTool
-
-gradio_tool = StableDiffusionPromptGeneratorTool()
-```
-
-해당 인스턴스를 `Tool.from_gradio` 메소드에 전달합니다:
-
-```python
-from transformers import Tool
-
-tool = Tool.from_gradio(gradio_tool)
-```
-
-이제 일반적인 사용자 정의 도구와 똑같이 관리할 수 있습니다. 
-이를 활용하여 `a rabbit wearing a space suit'(우주복을 입은 토끼)라는 프롬프트를 개선했습니다:
-
-```python
-from transformers import HfAgent
-
-agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
-
-agent.run("Generate an image of the `prompt` after improving it.", prompt="A rabbit wearing a space suit")
-```
-
-모델이 도구를 적절히 활용합니다:
-```text
-==Explanation from the agent==
-I will use the following  tools: `StableDiffusionPromptGenerator` to improve the prompt, then `image_generator` to generate an image according to the improved prompt.
-
-
-==Code generated by the agent==
-improved_prompt = StableDiffusionPromptGenerator(prompt)
-print(f"The improved prompt is {improved_prompt}.")
-image = image_generator(improved_prompt)
-```
-
-마지막으로 이미지를 생성하기 전에:
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png">
-
-<Tip warning={true}>
-
-gradio-tools는 다른 모달리티로 작업할 때에도 *텍스트* 입력 및 출력을 필요로 합니다. 
-이 구현은 이미지 및 오디오 객체에서 작동합니다. 
-현재는 이 두 가지가 호환되지 않지만 지원 개선을 위해 노력하면서 빠르게 호환될 것입니다.
-
-</Tip>
-
-## 향후 Langchain과의 호환성[[future-compatibility-with-langchain]]
-
-저희는 Langchain을 좋아하며 매우 매력적인 도구 모음을 가지고 있다고 생각합니다. 
-이러한 도구를 처리하기 위해 Langchain은 다른 모달리티와 작업할 때에도 *텍스트* 입력과 출력을 필요로 합니다.
-이는 종종 객체의 직렬화된(즉, 디스크에 저장된) 버전입니다.
-
-이 차이로 인해 transformers-agents와 Langchain 간에는 멀티 모달리티가 처리되지 않습니다. 
-향후 버전에서 이 제한이 해결되기를 바라며, 이 호환성을 달성할 수 있도록 열렬한 Langchain 사용자의 도움을 환영합니다.
-
-저희는 더 나은 지원을 제공하고자 합니다. 도움을 주고 싶으시다면, [이슈를 열어](https://github.com/huggingface/transformers/issues/new) 의견을 공유해 주세요.
--- a/docs/source/ko/generation_strategies.md
+++ b/docs/source/ko/generation_strategies.md
@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
 텍스트 생성은 개방형 텍스트 작성, 요약, 번역 등 다양한 자연어 처리(NLP) 작업에 필수적입니다. 이는 또한 음성-텍스트 변환, 시각-텍스트 변환과 같이 텍스트를 출력으로 하는 여러 혼합 모달리티 응용 프로그램에서도 중요한 역할을 합니다. 텍스트 생성을 가능하게 하는 몇몇 모델로는 GPT2, XLNet, OpenAI GPT, CTRL, TransformerXL, XLM, Bart, T5, GIT, Whisper 등이 있습니다.


-[`~transformers.generation_utils.GenerationMixin.generate`] 메서드를 활용하여 다음과 같은 다양한 작업들에 대해 텍스트 결과물을 생성하는 몇 가지 예시를 살펴보세요:
+[`~generation.GenerationMixin.generate`] 메서드를 활용하여 다음과 같은 다양한 작업들에 대해 텍스트 결과물을 생성하는 몇 가지 예시를 살펴보세요:
 * [텍스트 요약](./tasks/summarization#inference)
 * [이미지 캡셔닝](./model_doc/git#transformers.GitForCausalLM.forward.example)
 * [오디오 전사](./model_doc/whisper#transformers.WhisperForConditionalGeneration.forward.example)
@ -333,5 +333,5 @@ culture, and they allow us to design the'
 >>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
 >>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['Alice and Bob are going to the same party. It is a small party, in a small']
+['Alice and Bob, who were both in their early twenties, were both in the process of']
 ```
--- a/docs/source/ko/tasks/language_modeling.md
+++ b/docs/source/ko/tasks/language_modeling.md
@ -366,7 +366,7 @@ TensorFlow에서 모델을 미세 조정하려면, 먼저 옵티마이저 함수
 >>> inputs = tokenizer(prompt, return_tensors="pt").input_ids
 ```

-[`~transformers.generation_utils.GenerationMixin.generate`] 메소드를 사용하여 텍스트를 생성하세요. 생성을 제어하는 다양한 텍스트 생성 전략과 매개변수에 대한 자세한 내용은 [텍스트 생성 전략](../generation_strategies) 페이지를 확인하세요.
+[`~generation.GenerationMixin.generate`] 메소드를 사용하여 텍스트를 생성하세요. 생성을 제어하는 다양한 텍스트 생성 전략과 매개변수에 대한 자세한 내용은 [텍스트 생성 전략](../generation_strategies) 페이지를 확인하세요.

 ```py
 >>> from transformers import AutoModelForCausalLM
--- a/docs/source/ko/tasks/summarization.md
+++ b/docs/source/ko/tasks/summarization.md
@ -21,7 +21,7 @@ rendered properly in your Markdown viewer.
 <Youtube id="yHnr5Dk2zCI"/>

 요약은 문서나 기사에서 중요한 정보를 모두 포함하되 짧게 만드는 일입니다.
-번역과 마찬가지로, 시퀀스-투-시퀀스 문제로 구성할 수 있는 대표적인 작업 중 하나입니다. 
+번역과 마찬가지로, 시퀀스-투-시퀀스 문제로 구성할 수 있는 대표적인 작업 중 하나입니다.
 요약에는 아래와 같이 유형이 있습니다:

 - 추출(Extractive) 요약: 문서에서 가장 관련성 높은 정보를 추출합니다.
@ -44,7 +44,7 @@ rendered properly in your Markdown viewer.
 pip install transformers datasets evaluate rouge_score
 ```

-Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에 공유할 수 있습니다. 
+Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에 공유할 수 있습니다.
 토큰을 입력하여 로그인하세요.

 ```py
@ -114,14 +114,14 @@ Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에
 ...     return model_inputs
 ```

-전체 데이터셋에 전처리 함수를 적용하려면 🤗 Datasets의 [`~datasets.Dataset.map`] 메소드를 사용하세요. 
+전체 데이터셋에 전처리 함수를 적용하려면 🤗 Datasets의 [`~datasets.Dataset.map`] 메소드를 사용하세요.
 `batched=True`로 설정하여 데이터셋의 여러 요소를 한 번에 처리하면 `map` 함수의 속도를 높일 수 있습니다.

 ```py
 >>> tokenized_billsum = billsum.map(preprocess_function, batched=True)
 ```

-이제 [`DataCollatorForSeq2Seq`]를 사용하여 예제 배치를 만드세요. 
+이제 [`DataCollatorForSeq2Seq`]를 사용하여 예제 배치를 만드세요.
 전체 데이터셋을 최대 길이로 패딩하는 것보다 배치마다 가장 긴 문장 길이에 맞춰 *동적 패딩*하는 것이 더 효율적입니다.

 <frameworkcontent>
@ -143,9 +143,9 @@ Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에

 ## 평가[[evaluate]]

-학습 중에 평가 지표를 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다. 
-🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리를 사용하면 평가 방법을 빠르게 불러올 수 있습니다. 
-이 작업에서는 [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) 평가 지표를 가져옵니다. 
+학습 중에 평가 지표를 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다.
+🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리를 사용하면 평가 방법을 빠르게 불러올 수 있습니다.
+이 작업에서는 [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) 평가 지표를 가져옵니다.
 (평가 지표를 불러오고 계산하는 방법은 🤗 Evaluate [둘러보기](https://huggingface.co/docs/evaluate/a_quick_tour)를 참조하세요.)

 ```py
@ -196,9 +196,9 @@ Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에

 이제 세 단계만 남았습니다:

-1. [`Seq2SeqTrainingArguments`]에서 학습 하이퍼파라미터를 정의하세요. 
-유일한 필수 매개변수는 모델을 저장할 위치를 지정하는 `output_dir`입니다. 
-`push_to_hub=True`를 설정하여 이 모델을 Hub에 푸시할 수 있습니다(모델을 업로드하려면 Hugging Face에 로그인해야 합니다.) 
+1. [`Seq2SeqTrainingArguments`]에서 학습 하이퍼파라미터를 정의하세요.
+유일한 필수 매개변수는 모델을 저장할 위치를 지정하는 `output_dir`입니다.
+`push_to_hub=True`를 설정하여 이 모델을 Hub에 푸시할 수 있습니다(모델을 업로드하려면 Hugging Face에 로그인해야 합니다.)
 [`Trainer`]는 각 에폭이 끝날 때마다 ROUGE 지표를 평가하고 학습 체크포인트를 저장합니다.
 2. 모델, 데이터셋, 토크나이저, 데이터 콜레이터 및 `compute_metrics` 함수와 함께 학습 인수를 [`Seq2SeqTrainer`]에 전달하세요.
 3. [`~Trainer.train`]을 호출하여 모델을 파인튜닝하세요.
@ -285,7 +285,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습
 >>> model.compile(optimizer=optimizer)
 ```

-학습을 시작하기 전에 설정해야 할 마지막 두 가지는 예측에서 ROUGE 점수를 계산하고 모델을 Hub에 푸시하는 방법을 제공하는 것입니다. 
+학습을 시작하기 전에 설정해야 할 마지막 두 가지는 예측에서 ROUGE 점수를 계산하고 모델을 Hub에 푸시하는 방법을 제공하는 것입니다.
 두 작업 모두 [Keras callbacks](../main_classes/keras_callbacks)으로 수행할 수 있습니다.

 [`~transformers.KerasMetricCallback`]에 `compute_metrics` 함수를 전달하세요:
@ -313,7 +313,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습
 >>> callbacks = [metric_callback, push_to_hub_callback]
 ```

-드디어 모델 학습을 시작할 준비가 되었습니다! 
+드디어 모델 학습을 시작할 준비가 되었습니다!
 학습 및 검증 데이터셋, 에폭 수 및 콜백과 함께 [`fit`](https://keras.io/api/models/model_training_apis/#fit-method)을 호출하여 모델을 파인튜닝하세요.

 ```py
@ -326,7 +326,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습

 <Tip>

-요약을 위해 모델을 파인튜닝하는 방법에 대한 더 자세한 예제를 보려면 [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) 
+요약을 위해 모델을 파인튜닝하는 방법에 대한 더 자세한 예제를 보려면 [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)
 또는 [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)을 참고하세요.

 </Tip>
@ -341,7 +341,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습
 >>> text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
 ```

-추론을 위해 파인튜닝한 모델을 시험해 보는 가장 간단한 방법은 [`pipeline`]에서 사용하는 것입니다. 
+추론을 위해 파인튜닝한 모델을 시험해 보는 가장 간단한 방법은 [`pipeline`]에서 사용하는 것입니다.
 모델을 사용하여 요약을 수행할 [`pipeline`]을 인스턴스화하고 텍스트를 전달하세요:

 ```py
@ -366,7 +366,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습
 >>> inputs = tokenizer(text, return_tensors="pt").input_ids
 ```

-요약문을 생성하려면 [`~transformers.generation_utils.GenerationMixin.generate`] 메소드를 사용하세요. 
+요약문을 생성하려면 [`~generation.GenerationMixin.generate`] 메소드를 사용하세요.
 텍스트 생성에 대한 다양한 전략과 생성을 제어하기 위한 매개변수에 대한 자세한 내용은 [텍스트 생성](../main_classes/text_generation) API를 참조하세요.

 ```py
@ -393,7 +393,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습
 >>> inputs = tokenizer(text, return_tensors="tf").input_ids
 ```

-요약문을 생성하려면 [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] 메소드를 사용하세요. 
+요약문을 생성하려면 [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] 메소드를 사용하세요.
 텍스트 생성에 대한 다양한 전략과 생성을 제어하기 위한 매개변수에 대한 자세한 내용은 [텍스트 생성](../main_classes/text_generation) API를 참조하세요.

 ```py
--- a/docs/source/ko/tasks/translation.md
+++ b/docs/source/ko/tasks/translation.md
@ -341,7 +341,7 @@ TensorFlow에서 모델을 파인튜닝하려면 우선 optimizer 함수, 학습
 ```py
 >>> from transformers import pipeline

-# Change `xx` to the language of the input and `yy` to the language of the desired output. 
+# Change `xx` to the language of the input and `yy` to the language of the desired output.
 # Examples: "en" for English, "fr" for French, "de" for German, "es" for Spanish, "zh" for Chinese, etc; translation_en_to_fr translates English to French
 # You can view all the lists of languages here - https://huggingface.co/languages
 >>> translator = pipeline("translation_xx_to_yy", model="my_awesome_opus_books_model")
@ -362,7 +362,7 @@ TensorFlow에서 모델을 파인튜닝하려면 우선 optimizer 함수, 학습
 >>> inputs = tokenizer(text, return_tensors="pt").input_ids
 ```

-[`~transformers.generation_utils.GenerationMixin.generate`] 메서드로 번역을 생성하세요. 다양한 텍스트 생성 전략 및 생성을 제어하기 위한 매개변수에 대한 자세한 내용은 [Text Generation](../main_classes/text_generation) API를 살펴보시기 바랍니다.
+[`~generation.GenerationMixin.generate`] 메서드로 번역을 생성하세요. 다양한 텍스트 생성 전략 및 생성을 제어하기 위한 매개변수에 대한 자세한 내용은 [Text Generation](../main_classes/text_generation) API를 살펴보시기 바랍니다.

 ```py
 >>> from transformers import AutoModelForSeq2SeqLM
--- a/docs/source/pt/converting_tensorflow_models.md
+++ b/docs/source/pt/converting_tensorflow_models.md
@ -30,11 +30,11 @@ A documentação abaixo reflete o formato do comando **transformers-cli convert*

 ## BERT

-Você pode converter qualquer checkpoint do BERT em TensorFlow (em particular [os modelos pré-treinados lançados pelo Google](https://github.com/google-research/bert#pre-trained-models)) em um arquivo PyTorch usando um 
+Você pode converter qualquer checkpoint do BERT em TensorFlow (em particular [os modelos pré-treinados lançados pelo Google](https://github.com/google-research/bert#pre-trained-models)) em um arquivo PyTorch usando um
 [convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py) script.

 Esta Interface de Linha de Comando (CLI) recebe como entrada um checkpoint do TensorFlow (três arquivos começando com `bert_model.ckpt`) e o
-arquivo de configuração (`bert_config.json`), e então cria um modelo PyTorch para esta configuração, carrega os pesos 
+arquivo de configuração (`bert_config.json`), e então cria um modelo PyTorch para esta configuração, carrega os pesos
 do checkpoint do TensorFlow no modelo PyTorch e salva o modelo resultante em um arquivo PyTorch que pode
 ser importado usando `from_pretrained()` (veja o exemplo em [quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ).

@ -102,7 +102,7 @@ Aqui está um exemplo do processo de conversão para um modelo OpenAI GPT-2 pré
 ```bash
 export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights

-transformers-cli convert --model_type openai-community/gpt2 \
+transformers-cli convert --model_type gpt2 \
  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT2_CONFIG] \
--- a/docs/source/zh/main_classes/agent.md
+++ b/docs/source/zh/main_classes/agent.md
@ -18,84 +18,9 @@ rendered properly in your Markdown viewer.

 <Tip warning={true}>

-Transformers Agents是一个实验性的API，它随时可能发生变化。由于API或底层模型容易发生变化，因此由agents返回的结果可能会有所不同。
+The Agents framework has significantly changed in version v4.41.0.
+This document has been removed as it was referencing an older API.

+We eagerly welcome new contributions for the updated API.

 </Tip>
-
-要了解更多关于agents和工具的信息，请确保阅读[介绍指南](../transformers_agents)。此页面包含底层类的API文档。
-
-
-## Agents
-
-我们提供三种类型的agents：[`HfAgent`]使用开源模型的推理端点，[`LocalAgent`]使用您在本地选择的模型，[`OpenAiAgent`]使用OpenAI封闭模型。
-
-
-### HfAgent
-
-[[autodoc]] HfAgent
-
-### LocalAgent
-
-[[autodoc]] LocalAgent
-
-### OpenAiAgent
-
-[[autodoc]] OpenAiAgent
-
-### AzureOpenAiAgent
-
-[[autodoc]] AzureOpenAiAgent
-
-### Agent
-
-[[autodoc]] Agent 
-    - chat 
-    - run 
-    - prepare_for_new_chat
-
-## 工具
-
-### load_tool
-
-[[autodoc]] load_tool
-
-### Tool
-
-[[autodoc]] Tool
-
-### PipelineTool
-
-[[autodoc]] PipelineTool
-
-### RemoteTool
-
-[[autodoc]] RemoteTool
-
-### launch_gradio_demo
-
-[[autodoc]] launch_gradio_demo
-
-## Agent类型
-
-Agents可以处理工具之间任何类型的对象；工具是多模态的，可以接受和返回文本、图像、音频、视频等类型。为了增加工具之间的兼容性，以及正确地在ipython（jupyter、colab、ipython notebooks等）中呈现这些返回值，我们实现了这些类型的包装类。
-
-被包装的对象应该继续按照最初的行为方式运作；文本对象应该仍然像字符串一样运作，图像对象应该仍然像`PIL.Image`一样运作。
-
-这些类型有三个特定目的：
-
- 对类型调用 `to_raw` 应该返回底层对象
- 对类型调用 `to_string` 应该将对象作为字符串返回：在`AgentText`的情况下可能是字符串，但在其他情况下可能是对象序列化版本的路径
- 在ipython内核中显示它应该正确显示对象
-
-### AgentText
-
-[[autodoc]] transformers.tools.agent_types.AgentText
-
-### AgentImage
-
-[[autodoc]] transformers.tools.agent_types.AgentImage
-
-### AgentAudio
-
-[[autodoc]] transformers.tools.agent_types.AgentAudio
--- a/examples/README.md
+++ b/examples/README.md
@ -17,7 +17,7 @@ limitations under the License.

 We host a wide range of example scripts for multiple learning frameworks. Simply choose your favorite: [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch) or [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax).

-We also have some [research projects](https://github.com/huggingface/transformers/tree/main/examples/research_projects), as well as some [legacy examples](https://github.com/huggingface/transformers/tree/main/examples/legacy). Note that unlike the main examples these are not actively maintained, and may require specific older versions of dependencies in order to run. 
+We also have some [research projects](https://github.com/huggingface/transformers/tree/main/examples/research_projects), as well as some [legacy examples](https://github.com/huggingface/transformers/tree/main/examples/legacy). Note that unlike the main examples these are not actively maintained, and may require specific older versions of dependencies in order to run.

 While we strive to present as many use cases as possible, the example scripts are just that - examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, most of the examples fully expose the preprocessing of the data, allowing you to tweak and edit them as required.

@ -97,16 +97,16 @@ and run the example command as usual afterward.

 ## Running the Examples on Remote Hardware with Auto-Setup

-[run_on_remote.py](./run_on_remote.py) is a script that launches any example on remote self-hosted hardware, 
-with automatic hardware and environment setup. It uses [Runhouse](https://github.com/run-house/runhouse) to launch 
-on self-hosted hardware (e.g. in your own cloud account or on-premise cluster) but there are other options 
-for running remotely as well. You can easily customize the example used, command line arguments, dependencies, 
+[run_on_remote.py](./run_on_remote.py) is a script that launches any example on remote self-hosted hardware,
+with automatic hardware and environment setup. It uses [Runhouse](https://github.com/run-house/runhouse) to launch
+on self-hosted hardware (e.g. in your own cloud account or on-premise cluster) but there are other options
+for running remotely as well. You can easily customize the example used, command line arguments, dependencies,
 and type of compute hardware, and then run the script to automatically launch the example.

-You can refer to 
+You can refer to
 [hardware setup](https://runhouse-docs.readthedocs-hosted.com/en/latest/api/python/cluster.html#hardware-setup)
 for more information about hardware and dependency setup with Runhouse, or this
-[Colab tutorial](https://colab.research.google.com/drive/1sh_aNQzJX5BKAdNeXthTNGxKz7sM9VPc) for a more in-depth 
+[Colab tutorial](https://colab.research.google.com/drive/1sh_aNQzJX5BKAdNeXthTNGxKz7sM9VPc) for a more in-depth
 walkthrough.

 You can run the script with the following commands:
@ -118,7 +118,7 @@ pip install runhouse
 # For an on-demand V100 with whichever cloud provider you have configured:
 python run_on_remote.py \
    --example pytorch/text-generation/run_generation.py \
-    --model_type=openai-community/gpt2 \
+    --model_type=gpt2 \
    --model_name_or_path=openai-community/gpt2 \
    --prompt "I am a language model and"

--- a/Show More
+++ b/Show More