mirror of
				https://github.com/huggingface/transformers.git
				synced 2025-10-25 12:44:35 +08:00 
			
		
		
		
	Compare commits
	
		
			1 Commits
		
	
	
		
			feat/conti
			...
			update-tp-
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 7f5932d0ee | 
| @ -7,18 +7,6 @@ parameters: | ||||
|     nightly: | ||||
|         type: boolean | ||||
|         default: false | ||||
|     GHA_Actor: | ||||
|         type: string | ||||
|         default: "" | ||||
|     GHA_Action: | ||||
|         type: string | ||||
|         default: "" | ||||
|     GHA_Event: | ||||
|         type: string | ||||
|         default: "" | ||||
|     GHA_Meta: | ||||
|         type: string | ||||
|         default: "" | ||||
|  | ||||
| jobs: | ||||
|     # Ensure running with CircleCI/huggingface | ||||
| @ -112,6 +100,8 @@ jobs: | ||||
|  | ||||
|             - run: | ||||
|                 name: "Retrieve Artifact Paths" | ||||
|                 env: | ||||
|                     CIRCLE_TOKEN: ${{ secrets.CI_ARTIFACT_TOKEN }} | ||||
|                 command: | | ||||
|                     project_slug="gh/${CIRCLE_PROJECT_USERNAME}/${CIRCLE_PROJECT_REPONAME}" | ||||
|                     job_number=${CIRCLE_BUILD_NUM} | ||||
| @ -156,7 +146,7 @@ jobs: | ||||
|                   path: ~/transformers/installed.txt | ||||
|             - run: python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1) | ||||
|             - run: ruff check examples tests src utils | ||||
|             - run: ruff format examples tests src utils --check | ||||
|             - run: ruff format tests src utils --check | ||||
|             - run: python utils/custom_init_isort.py --check_only | ||||
|             - run: python utils/sort_auto_mappings.py --check_only | ||||
|             - run: python utils/check_doc_toc.py | ||||
| @ -180,7 +170,8 @@ jobs: | ||||
|             - store_artifacts: | ||||
|                   path: ~/transformers/installed.txt | ||||
|             - run: python utils/check_copies.py | ||||
|             - run: python utils/check_modular_conversion.py | ||||
|             - run: python utils/check_modular_conversion.py --num_workers 4 | ||||
|             - run: python utils/check_table.py | ||||
|             - run: python utils/check_dummies.py | ||||
|             - run: python utils/check_repo.py | ||||
|             - run: python utils/check_inits.py | ||||
| @ -190,6 +181,7 @@ jobs: | ||||
|             - run: make deps_table_check_updated | ||||
|             - run: python utils/update_metadata.py --check-only | ||||
|             - run: python utils/check_docstrings.py | ||||
|             - run: python utils/check_support_list.py | ||||
|  | ||||
| workflows: | ||||
|     version: 2 | ||||
|  | ||||
| @ -28,32 +28,11 @@ COMMON_ENV_VARIABLES = { | ||||
|     "TRANSFORMERS_IS_CI": True, | ||||
|     "PYTEST_TIMEOUT": 120, | ||||
|     "RUN_PIPELINE_TESTS": False, | ||||
|     # will be adjusted in `CircleCIJob.to_dict`. | ||||
|     "RUN_FLAKY": True, | ||||
| } | ||||
| # Disable the use of {"s": None} as the output is way too long, making navigation on CircleCI impractical | ||||
| COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None} | ||||
| COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsfE":None} | ||||
| DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}] | ||||
|  | ||||
| # Strings that commonly appear in the output of flaky tests when they fail. These are used with `pytest-rerunfailures` | ||||
| # to rerun the tests that match these patterns. | ||||
| FLAKY_TEST_FAILURE_PATTERNS = [ | ||||
|     "OSError",  # Machine/connection transient error | ||||
|     "Timeout",  # Machine/connection transient error | ||||
|     "ConnectionError",  # Connection transient error | ||||
|     "FileNotFoundError",  # Raised by `datasets` on Hub failures | ||||
|     "PIL.UnidentifiedImageError",  # Raised by `PIL.Image.open` on connection issues | ||||
|     "HTTPError",  # Also catches HfHubHTTPError | ||||
|     "AssertionError: Tensor-likes are not close!",  # `torch.testing.assert_close`, we might have unlucky random values | ||||
|     # TODO: error downloading tokenizer's `merged.txt` from hub can cause all the exceptions below. Throw and handle | ||||
|     # them under a single message. | ||||
|     "TypeError: expected str, bytes or os.PathLike object, not NoneType", | ||||
|     "TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType", | ||||
|     "Converting from Tiktoken failed", | ||||
|     "KeyError: <class ", | ||||
|     "TypeError: not a string", | ||||
| ] | ||||
|  | ||||
|  | ||||
| class EmptyJob: | ||||
|     job_name = "empty" | ||||
| @ -128,8 +107,6 @@ class CircleCIJob: | ||||
|  | ||||
|     def to_dict(self): | ||||
|         env = COMMON_ENV_VARIABLES.copy() | ||||
|         # Do not run tests decorated by @is_flaky on pull requests | ||||
|         env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == "" | ||||
|         env.update(self.additional_env) | ||||
|  | ||||
|         job = { | ||||
| @ -147,9 +124,7 @@ class CircleCIJob: | ||||
|                 # Examples special case: we need to download NLTK files in advance to avoid concurrency issues | ||||
|         timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else "" | ||||
|         marker_cmd = f"-m '{self.marker}'" if self.marker is not None else "" | ||||
|         junit_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml" | ||||
|         joined_flaky_patterns = "|".join(FLAKY_TEST_FAILURE_PATTERNS) | ||||
|         repeat_on_failure_flags = f"--reruns 5 --reruns-delay 2 --only-rerun '({joined_flaky_patterns})'" | ||||
|         additional_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml" | ||||
|         parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> ' | ||||
|         steps = [ | ||||
|             "checkout", | ||||
| @ -175,10 +150,9 @@ class CircleCIJob: | ||||
|                     "command": f"TESTS=$(circleci tests split  --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt" | ||||
|                     } | ||||
|             }, | ||||
|             {"run": {"name": "fetch hub objects before pytest", "command": "python3 utils/fetch_hub_objects_for_ci.py"}}, | ||||
|             {"run": { | ||||
|                 "name": "Run tests", | ||||
|                 "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"} | ||||
|                 "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {additional_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"} | ||||
|             }, | ||||
|             {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}}, | ||||
|             {"run": {"name": "Failed tests: show reasons",   "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}}, | ||||
| @ -211,9 +185,6 @@ torch_job = CircleCIJob( | ||||
| generate_job = CircleCIJob( | ||||
|     "generate", | ||||
|     docker_image=[{"image": "huggingface/transformers-torch-light"}], | ||||
|     # networkx==3.3 (after #36957) causes some issues | ||||
|     # TODO: remove this once it works directly | ||||
|     install_steps=["uv venv && uv pip install ."], | ||||
|     marker="generate", | ||||
|     parallelism=6, | ||||
| ) | ||||
| @ -277,7 +248,6 @@ examples_torch_job = CircleCIJob( | ||||
|     docker_image=[{"image":"huggingface/transformers-examples-torch"}], | ||||
|     # TODO @ArthurZucker remove this once docker is easier to build | ||||
|     install_steps=["uv venv && uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"], | ||||
|     pytest_num_workers=4, | ||||
| ) | ||||
|  | ||||
|  | ||||
| @ -285,7 +255,6 @@ examples_tensorflow_job = CircleCIJob( | ||||
|     "examples_tensorflow", | ||||
|     additional_env={"OMP_NUM_THREADS": 8}, | ||||
|     docker_image=[{"image":"huggingface/transformers-examples-tf"}], | ||||
|     pytest_num_workers=2, | ||||
| ) | ||||
|  | ||||
|  | ||||
| @ -309,7 +278,7 @@ onnx_job = CircleCIJob( | ||||
|     docker_image=[{"image":"huggingface/transformers-torch-tf-light"}], | ||||
|     install_steps=[ | ||||
|         "uv venv", | ||||
|         "uv pip install .[testing,sentencepiece,onnxruntime,vision,rjieba]", | ||||
|         "uv pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]", | ||||
|     ], | ||||
|     pytest_options={"k onnx": None}, | ||||
|     pytest_num_workers=1, | ||||
| @ -336,9 +305,6 @@ repo_utils_job = CircleCIJob( | ||||
| non_model_job = CircleCIJob( | ||||
|     "non_model", | ||||
|     docker_image=[{"image": "huggingface/transformers-torch-light"}], | ||||
|     # networkx==3.3 (after #36957) causes some issues | ||||
|     # TODO: remove this once it works directly | ||||
|     install_steps=["uv venv && uv pip install ."], | ||||
|     marker="not generate", | ||||
|     parallelism=6, | ||||
| ) | ||||
| @ -368,9 +334,9 @@ doc_test_job = CircleCIJob( | ||||
|     pytest_num_workers=1, | ||||
| ) | ||||
|  | ||||
| REGULAR_TESTS = [torch_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip | ||||
| EXAMPLES_TESTS = [examples_torch_job] | ||||
| PIPELINE_TESTS = [pipelines_torch_job] | ||||
| REGULAR_TESTS = [torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip | ||||
| EXAMPLES_TESTS = [examples_torch_job, examples_tensorflow_job] | ||||
| PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job] | ||||
| REPO_UTIL_TESTS = [repo_utils_job] | ||||
| DOC_TESTS = [doc_test_job] | ||||
| ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job]  # fmt: skip | ||||
| @ -397,12 +363,7 @@ def create_circleci_config(folder=None): | ||||
|         "parameters": { | ||||
|             # Only used to accept the parameters from the trigger | ||||
|             "nightly": {"type": "boolean", "default": False}, | ||||
|             # Only used to accept the parameters from GitHub Actions trigger | ||||
|             "GHA_Actor": {"type": "string", "default": ""}, | ||||
|             "GHA_Action": {"type": "string", "default": ""}, | ||||
|             "GHA_Event": {"type": "string", "default": ""}, | ||||
|             "GHA_Meta": {"type": "string", "default": ""}, | ||||
|             "tests_to_run": {"type": "string", "default": ""}, | ||||
|             "tests_to_run": {"type": "string", "default": ''}, | ||||
|             **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}, | ||||
|             **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs}, | ||||
|         }, | ||||
|  | ||||
							
								
								
									
										18
									
								
								.github/ISSUE_TEMPLATE/bug-report.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										18
									
								
								.github/ISSUE_TEMPLATE/bug-report.yml
									
									
									
									
										vendored
									
									
								
							| @ -16,7 +16,7 @@ body: | ||||
|     id: system-info | ||||
|     attributes: | ||||
|       label: System Info | ||||
|       description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below. | ||||
|       description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below. | ||||
|       placeholder: transformers version, platform, python version, ... | ||||
|     validations: | ||||
|       required: true | ||||
| @ -38,31 +38,25 @@ body: | ||||
|  | ||||
|           - text models: @ArthurZucker | ||||
|           - vision models: @amyeroberts, @qubvel | ||||
|           - speech models: @eustlb | ||||
|           - speech models: @ylacombe, @eustlb | ||||
|           - graph models: @clefourrier | ||||
|  | ||||
|         Library: | ||||
|  | ||||
|           - flax: @gante and @Rocketknight1 | ||||
|           - flax: @sanchit-gandhi | ||||
|           - generate: @zucchini-nlp (visual-language models) or @gante (all others) | ||||
|           - pipelines: @Rocketknight1 | ||||
|           - tensorflow: @gante and @Rocketknight1 | ||||
|           - tokenizers: @ArthurZucker and @itazap | ||||
|           - trainer: @zach-huggingface @SunMarc | ||||
|           - trainer: @muellerzr @SunMarc | ||||
|  | ||||
|         Integrations: | ||||
|  | ||||
|           - deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface | ||||
|           - deepspeed: HF Trainer/Accelerate: @muellerzr | ||||
|           - ray/raytune: @richardliaw, @amogkam | ||||
|           - Big Model Inference: @SunMarc | ||||
|           - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber | ||||
|  | ||||
|         Devices/Backends: | ||||
|          | ||||
|           - AMD ROCm: @ivarflakstad | ||||
|           - Intel XPU: @IlyasMoutawwakil | ||||
|           - Ascend NPU: @ivarflakstad  | ||||
|  | ||||
|         Documentation: @stevhliu | ||||
|  | ||||
|         Model hub: | ||||
| @ -78,7 +72,7 @@ body: | ||||
|  | ||||
|         Maintained examples (not research project or legacy): | ||||
|  | ||||
|           - Flax: @Rocketknight1 | ||||
|           - Flax: @sanchit-gandhi | ||||
|           - PyTorch: See Models above and tag the person corresponding to the modality of the example. | ||||
|           - TensorFlow: @Rocketknight1 | ||||
|  | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/ISSUE_TEMPLATE/i18n.md
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/ISSUE_TEMPLATE/i18n.md
									
									
									
									
										vendored
									
									
								
							| @ -23,7 +23,7 @@ Some notes: | ||||
| * Please translate in a gender-neutral way. | ||||
| * Add your translations to the folder called `<languageCode>` inside the [source folder](https://github.com/huggingface/transformers/tree/main/docs/source). | ||||
| * Register your translation in `<languageCode>/_toctree.yml`; please follow the order of the [English version](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml). | ||||
| * Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu for review. | ||||
| * Once you're finished, open a pull request and tag this issue by including #issue-number in the description, where issue-number is the number of this issue. Please ping @stevhliu and @MKhalusova for review. | ||||
| * 🙋 If you'd like others to help you with the translation, you can also post in the 🤗 [forums](https://discuss.huggingface.co/). | ||||
|  | ||||
| ## Get Started section | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/ISSUE_TEMPLATE/migration.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/ISSUE_TEMPLATE/migration.yml
									
									
									
									
										vendored
									
									
								
							| @ -6,7 +6,7 @@ body: | ||||
|     id: system-info | ||||
|     attributes: | ||||
|       label: System Info | ||||
|       description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below. | ||||
|       description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below. | ||||
|       render: shell | ||||
|       placeholder: transformers version, platform, python version, ... | ||||
|     validations: | ||||
|  | ||||
							
								
								
									
										10
									
								
								.github/PULL_REQUEST_TEMPLATE.md
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										10
									
								
								.github/PULL_REQUEST_TEMPLATE.md
									
									
									
									
										vendored
									
									
								
							| @ -41,22 +41,22 @@ Models: | ||||
|  | ||||
| - text models: @ArthurZucker | ||||
| - vision models: @amyeroberts, @qubvel | ||||
| - speech models: @eustlb | ||||
| - speech models: @ylacombe, @eustlb | ||||
| - graph models: @clefourrier | ||||
|  | ||||
| Library: | ||||
|  | ||||
| - flax: @gante and @Rocketknight1 | ||||
| - flax: @sanchit-gandhi | ||||
| - generate: @zucchini-nlp (visual-language models) or @gante (all others) | ||||
| - pipelines: @Rocketknight1 | ||||
| - tensorflow: @gante and @Rocketknight1 | ||||
| - tokenizers: @ArthurZucker | ||||
| - trainer: @zach-huggingface and @SunMarc | ||||
| - trainer: @muellerzr and @SunMarc | ||||
| - chat templates: @Rocketknight1 | ||||
|  | ||||
| Integrations: | ||||
|  | ||||
| - deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface | ||||
| - deepspeed: HF Trainer/Accelerate: @muellerzr | ||||
| - ray/raytune: @richardliaw, @amogkam | ||||
| - Big Model Inference: @SunMarc | ||||
| - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber | ||||
| @ -72,7 +72,7 @@ HF projects: | ||||
|  | ||||
| Maintained examples (not research project or legacy): | ||||
|  | ||||
| - Flax: @Rocketknight1 | ||||
| - Flax: @sanchit-gandhi | ||||
| - PyTorch: See Models above and tag the person corresponding to the modality of the example. | ||||
| - TensorFlow: @Rocketknight1 | ||||
|  | ||||
|  | ||||
							
								
								
									
										120
									
								
								.github/scripts/assign_reviewers.py
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										120
									
								
								.github/scripts/assign_reviewers.py
									
									
									
									
										vendored
									
									
								
							| @ -1,120 +0,0 @@ | ||||
| # coding=utf-8 | ||||
| # Copyright 2025 the HuggingFace Inc. team. All rights reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #     http://www.apache.org/licenses/LICENSE-2.0 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
|  | ||||
| import os | ||||
| import github | ||||
| import json | ||||
| from github import Github | ||||
| import re | ||||
| from collections import Counter | ||||
| from pathlib import Path | ||||
|  | ||||
| def pattern_to_regex(pattern): | ||||
|     if pattern.startswith("/"): | ||||
|         start_anchor = True | ||||
|         pattern = re.escape(pattern[1:]) | ||||
|     else: | ||||
|         start_anchor = False | ||||
|         pattern = re.escape(pattern) | ||||
|     # Replace `*` with "any number of non-slash characters" | ||||
|     pattern = pattern.replace(r"\*", "[^/]*") | ||||
|     if start_anchor: | ||||
|         pattern = r"^\/?" + pattern  # Allow an optional leading slash after the start of the string | ||||
|     return pattern | ||||
|  | ||||
| def get_file_owners(file_path, codeowners_lines): | ||||
|     # Process lines in reverse (last matching pattern takes precedence) | ||||
|     for line in reversed(codeowners_lines): | ||||
|         # Skip comments and empty lines, strip inline comments | ||||
|         line = line.split('#')[0].strip() | ||||
|         if not line: | ||||
|             continue | ||||
|  | ||||
|         # Split into pattern and owners | ||||
|         parts = line.split() | ||||
|         pattern = parts[0] | ||||
|         # Can be empty, e.g. for dummy files with explicitly no owner! | ||||
|         owners = [owner.removeprefix("@") for owner in parts[1:]] | ||||
|  | ||||
|         # Check if file matches pattern | ||||
|         file_regex = pattern_to_regex(pattern) | ||||
|         if re.search(file_regex, file_path) is not None: | ||||
|             return owners  # Remember, can still be empty! | ||||
|     return []  # Should never happen, but just in case | ||||
|  | ||||
| def pr_author_is_in_hf(pr_author, codeowners_lines): | ||||
|     # Check if the PR author is in the codeowners file | ||||
|     for line in codeowners_lines: | ||||
|         line = line.split('#')[0].strip() | ||||
|         if not line: | ||||
|             continue | ||||
|  | ||||
|         # Split into pattern and owners | ||||
|         parts = line.split() | ||||
|         owners = [owner.removeprefix("@") for owner in parts[1:]] | ||||
|  | ||||
|         if pr_author in owners: | ||||
|             return True | ||||
|     return False | ||||
|  | ||||
| def main(): | ||||
|     script_dir = Path(__file__).parent.absolute() | ||||
|     with open(script_dir / "codeowners_for_review_action") as f: | ||||
|         codeowners_lines = f.readlines() | ||||
|  | ||||
|     g = Github(os.environ['GITHUB_TOKEN']) | ||||
|     repo = g.get_repo("huggingface/transformers") | ||||
|     with open(os.environ['GITHUB_EVENT_PATH']) as f: | ||||
|         event = json.load(f) | ||||
|  | ||||
|     # The PR number is available in the event payload | ||||
|     pr_number = event['pull_request']['number'] | ||||
|     pr = repo.get_pull(pr_number) | ||||
|     pr_author = pr.user.login | ||||
|     if pr_author_is_in_hf(pr_author, codeowners_lines): | ||||
|         print(f"PR author {pr_author} is in codeowners, skipping review request.") | ||||
|         return | ||||
|  | ||||
|     existing_reviews = list(pr.get_reviews()) | ||||
|     if existing_reviews: | ||||
|         print(f"Already has reviews: {[r.user.login for r in existing_reviews]}") | ||||
|         return | ||||
|  | ||||
|     users_requested, teams_requested = pr.get_review_requests() | ||||
|     users_requested = list(users_requested) | ||||
|     if users_requested: | ||||
|         print(f"Reviewers already requested: {users_requested}") | ||||
|         return | ||||
|  | ||||
|     locs_per_owner = Counter() | ||||
|     for file in pr.get_files(): | ||||
|         owners = get_file_owners(file.filename, codeowners_lines) | ||||
|         for owner in owners: | ||||
|             locs_per_owner[owner] += file.changes | ||||
|  | ||||
|     # Assign the top 2 owners by lines changed as reviewers, but skip the PR author if present | ||||
|     locs_per_owner.pop(pr_author, None) | ||||
|     top_owners = locs_per_owner.most_common(2) | ||||
|     print("Top owners", top_owners) | ||||
|     top_owners = [owner[0] for owner in top_owners] | ||||
|     try: | ||||
|         pr.create_review_request(top_owners) | ||||
|     except github.GithubException as e: | ||||
|         print(f"Failed to request review for {top_owners}: {e}") | ||||
|  | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
										370
									
								
								.github/scripts/codeowners_for_review_action
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										370
									
								
								.github/scripts/codeowners_for_review_action
									
									
									
									
										vendored
									
									
								
							| @ -1,370 +0,0 @@ | ||||
| # Top-level rules are matched only if nothing else matches | ||||
| * @Rocketknight1 @ArthurZucker # if no one is pinged based on the other rules, they will do the dispatch | ||||
| *.md @stevhliu | ||||
| *tokenization* @ArthurZucker | ||||
| docs/ @stevhliu | ||||
| /benchmark/ @McPatate | ||||
| /docker/ @ydshieh @ArthurZucker | ||||
|  | ||||
| # More high-level globs catch cases when specific rules later don't apply | ||||
| /src/transformers/models/*/processing* @molbap @yonigozlan @qubvel | ||||
| /src/transformers/models/*/image_processing* @qubvel | ||||
| /src/transformers/models/*/image_processing_*_fast* @yonigozlan | ||||
|  | ||||
| # Owners of subsections of the library | ||||
| /src/transformers/generation/ @gante | ||||
| /src/transformers/pipeline/ @Rocketknight1 @yonigozlan | ||||
| /src/transformers/integrations/ @SunMarc @MekkCyber @zach-huggingface | ||||
| /src/transformers/quantizers/ @SunMarc @MekkCyber | ||||
| tests/ @ydshieh | ||||
| tests/generation/ @gante | ||||
|  | ||||
| /src/transformers/models/auto/ @ArthurZucker | ||||
| /src/transformers/utils/ @ArthurZucker @Rocketknight1 | ||||
| /src/transformers/loss/ @ArthurZucker | ||||
| /src/transformers/onnx/ @michaelbenayoun | ||||
|  | ||||
| # Specific files come after the sections/globs, so they take priority | ||||
| /.circleci/config.yml @ArthurZucker @ydshieh | ||||
| /utils/tests_fetcher.py @ydshieh | ||||
| trainer.py @zach-huggingface @SunMarc | ||||
| trainer_utils.py @zach-huggingface @SunMarc | ||||
| /utils/modular_model_converter.py @Cyrilvallez @ArthurZucker | ||||
|  | ||||
| # Owners of individual models are specific / high priority, and so they come last | ||||
| # mod* captures modeling and modular files | ||||
|  | ||||
| # Text models | ||||
| /src/transformers/models/albert/mod*_albert* @ArthurZucker | ||||
| /src/transformers/models/bamba/mod*_bamba* @ArthurZucker | ||||
| /src/transformers/models/bart/mod*_bart* @ArthurZucker | ||||
| /src/transformers/models/barthez/mod*_barthez* @ArthurZucker | ||||
| /src/transformers/models/bartpho/mod*_bartpho* @ArthurZucker | ||||
| /src/transformers/models/bert/mod*_bert* @ArthurZucker | ||||
| /src/transformers/models/bert_generation/mod*_bert_generation* @ArthurZucker | ||||
| /src/transformers/models/bert_japanese/mod*_bert_japanese* @ArthurZucker | ||||
| /src/transformers/models/bertweet/mod*_bertweet* @ArthurZucker | ||||
| /src/transformers/models/big_bird/mod*_big_bird* @ArthurZucker | ||||
| /src/transformers/models/bigbird_pegasus/mod*_bigbird_pegasus* @ArthurZucker | ||||
| /src/transformers/models/biogpt/mod*_biogpt* @ArthurZucker | ||||
| /src/transformers/models/blenderbot/mod*_blenderbot* @ArthurZucker | ||||
| /src/transformers/models/blenderbot_small/mod*_blenderbot_small* @ArthurZucker | ||||
| /src/transformers/models/bloom/mod*_bloom* @ArthurZucker | ||||
| /src/transformers/models/bort/mod*_bort* @ArthurZucker | ||||
| /src/transformers/models/byt5/mod*_byt5* @ArthurZucker | ||||
| /src/transformers/models/camembert/mod*_camembert* @ArthurZucker | ||||
| /src/transformers/models/canine/mod*_canine* @ArthurZucker | ||||
| /src/transformers/models/codegen/mod*_codegen* @ArthurZucker | ||||
| /src/transformers/models/code_llama/mod*_code_llama* @ArthurZucker | ||||
| /src/transformers/models/cohere/mod*_cohere* @ArthurZucker | ||||
| /src/transformers/models/cohere2/mod*_cohere2* @ArthurZucker | ||||
| /src/transformers/models/convbert/mod*_convbert* @ArthurZucker | ||||
| /src/transformers/models/cpm/mod*_cpm* @ArthurZucker | ||||
| /src/transformers/models/cpmant/mod*_cpmant* @ArthurZucker | ||||
| /src/transformers/models/ctrl/mod*_ctrl* @ArthurZucker | ||||
| /src/transformers/models/dbrx/mod*_dbrx* @ArthurZucker | ||||
| /src/transformers/models/deberta/mod*_deberta* @ArthurZucker | ||||
| /src/transformers/models/deberta_v2/mod*_deberta_v2* @ArthurZucker | ||||
| /src/transformers/models/dialogpt/mod*_dialogpt* @ArthurZucker | ||||
| /src/transformers/models/diffllama/mod*_diffllama* @ArthurZucker | ||||
| /src/transformers/models/distilbert/mod*_distilbert* @ArthurZucker | ||||
| /src/transformers/models/dpr/mod*_dpr* @ArthurZucker | ||||
| /src/transformers/models/electra/mod*_electra* @ArthurZucker | ||||
| /src/transformers/models/encoder_decoder/mod*_encoder_decoder* @ArthurZucker | ||||
| /src/transformers/models/ernie/mod*_ernie* @ArthurZucker | ||||
| /src/transformers/models/ernie_m/mod*_ernie_m* @ArthurZucker | ||||
| /src/transformers/models/esm/mod*_esm* @ArthurZucker | ||||
| /src/transformers/models/falcon/mod*_falcon* @ArthurZucker | ||||
| /src/transformers/models/falcon3/mod*_falcon3* @ArthurZucker | ||||
| /src/transformers/models/falcon_mamba/mod*_falcon_mamba* @ArthurZucker | ||||
| /src/transformers/models/fastspeech2_conformer/mod*_fastspeech2_conformer* @ArthurZucker | ||||
| /src/transformers/models/flan_t5/mod*_flan_t5* @ArthurZucker | ||||
| /src/transformers/models/flan_ul2/mod*_flan_ul2* @ArthurZucker | ||||
| /src/transformers/models/flaubert/mod*_flaubert* @ArthurZucker | ||||
| /src/transformers/models/fnet/mod*_fnet* @ArthurZucker | ||||
| /src/transformers/models/fsmt/mod*_fsmt* @ArthurZucker | ||||
| /src/transformers/models/funnel/mod*_funnel* @ArthurZucker | ||||
| /src/transformers/models/fuyu/mod*_fuyu* @ArthurZucker | ||||
| /src/transformers/models/gemma/mod*_gemma* @ArthurZucker | ||||
| /src/transformers/models/gemma2/mod*_gemma2* @ArthurZucker | ||||
| /src/transformers/models/glm/mod*_glm* @ArthurZucker | ||||
| /src/transformers/models/openai_gpt/mod*_openai_gpt* @ArthurZucker | ||||
| /src/transformers/models/gpt_neo/mod*_gpt_neo* @ArthurZucker | ||||
| /src/transformers/models/gpt_neox/mod*_gpt_neox* @ArthurZucker | ||||
| /src/transformers/models/gpt_neox_japanese/mod*_gpt_neox_japanese* @ArthurZucker | ||||
| /src/transformers/models/gptj/mod*_gptj* @ArthurZucker | ||||
| /src/transformers/models/gpt2/mod*_gpt2* @ArthurZucker | ||||
| /src/transformers/models/gpt_bigcode/mod*_gpt_bigcode* @ArthurZucker | ||||
| /src/transformers/models/gptsan_japanese/mod*_gptsan_japanese* @ArthurZucker | ||||
| /src/transformers/models/gpt_sw3/mod*_gpt_sw3* @ArthurZucker | ||||
| /src/transformers/models/granite/mod*_granite* @ArthurZucker | ||||
| /src/transformers/models/granitemoe/mod*_granitemoe* @ArthurZucker | ||||
| /src/transformers/models/herbert/mod*_herbert* @ArthurZucker | ||||
| /src/transformers/models/ibert/mod*_ibert* @ArthurZucker | ||||
| /src/transformers/models/jamba/mod*_jamba* @ArthurZucker | ||||
| /src/transformers/models/jetmoe/mod*_jetmoe* @ArthurZucker | ||||
| /src/transformers/models/jukebox/mod*_jukebox* @ArthurZucker | ||||
| /src/transformers/models/led/mod*_led* @ArthurZucker | ||||
| /src/transformers/models/llama/mod*_llama* @ArthurZucker @Cyrilvallez | ||||
| /src/transformers/models/longformer/mod*_longformer* @ArthurZucker | ||||
| /src/transformers/models/longt5/mod*_longt5* @ArthurZucker | ||||
| /src/transformers/models/luke/mod*_luke* @ArthurZucker | ||||
| /src/transformers/models/m2m_100/mod*_m2m_100* @ArthurZucker | ||||
| /src/transformers/models/madlad_400/mod*_madlad_400* @ArthurZucker | ||||
| /src/transformers/models/mamba/mod*_mamba* @ArthurZucker | ||||
| /src/transformers/models/mamba2/mod*_mamba2* @ArthurZucker | ||||
| /src/transformers/models/marian/mod*_marian* @ArthurZucker | ||||
| /src/transformers/models/markuplm/mod*_markuplm* @ArthurZucker | ||||
| /src/transformers/models/mbart/mod*_mbart* @ArthurZucker | ||||
| /src/transformers/models/mega/mod*_mega* @ArthurZucker | ||||
| /src/transformers/models/megatron_bert/mod*_megatron_bert* @ArthurZucker | ||||
| /src/transformers/models/megatron_gpt2/mod*_megatron_gpt2* @ArthurZucker | ||||
| /src/transformers/models/mistral/mod*_mistral* @ArthurZucker | ||||
| /src/transformers/models/mixtral/mod*_mixtral* @ArthurZucker | ||||
| /src/transformers/models/mluke/mod*_mluke* @ArthurZucker | ||||
| /src/transformers/models/mobilebert/mod*_mobilebert* @ArthurZucker | ||||
| /src/transformers/models/modernbert/mod*_modernbert* @ArthurZucker | ||||
| /src/transformers/models/mpnet/mod*_mpnet* @ArthurZucker | ||||
| /src/transformers/models/mpt/mod*_mpt* @ArthurZucker | ||||
| /src/transformers/models/mra/mod*_mra* @ArthurZucker | ||||
| /src/transformers/models/mt5/mod*_mt5* @ArthurZucker | ||||
| /src/transformers/models/mvp/mod*_mvp* @ArthurZucker | ||||
| /src/transformers/models/myt5/mod*_myt5* @ArthurZucker | ||||
| /src/transformers/models/nemotron/mod*_nemotron* @ArthurZucker | ||||
| /src/transformers/models/nezha/mod*_nezha* @ArthurZucker | ||||
| /src/transformers/models/nllb/mod*_nllb* @ArthurZucker | ||||
| /src/transformers/models/nllb_moe/mod*_nllb_moe* @ArthurZucker | ||||
| /src/transformers/models/nystromformer/mod*_nystromformer* @ArthurZucker | ||||
| /src/transformers/models/olmo/mod*_olmo* @ArthurZucker | ||||
| /src/transformers/models/olmo2/mod*_olmo2* @ArthurZucker | ||||
| /src/transformers/models/olmoe/mod*_olmoe* @ArthurZucker | ||||
| /src/transformers/models/open_llama/mod*_open_llama* @ArthurZucker | ||||
| /src/transformers/models/opt/mod*_opt* @ArthurZucker | ||||
| /src/transformers/models/pegasus/mod*_pegasus* @ArthurZucker | ||||
| /src/transformers/models/pegasus_x/mod*_pegasus_x* @ArthurZucker | ||||
| /src/transformers/models/persimmon/mod*_persimmon* @ArthurZucker | ||||
| /src/transformers/models/phi/mod*_phi* @ArthurZucker | ||||
| /src/transformers/models/phi3/mod*_phi3* @ArthurZucker | ||||
| /src/transformers/models/phimoe/mod*_phimoe* @ArthurZucker | ||||
| /src/transformers/models/phobert/mod*_phobert* @ArthurZucker | ||||
| /src/transformers/models/plbart/mod*_plbart* @ArthurZucker | ||||
| /src/transformers/models/prophetnet/mod*_prophetnet* @ArthurZucker | ||||
| /src/transformers/models/qdqbert/mod*_qdqbert* @ArthurZucker | ||||
| /src/transformers/models/qwen2/mod*_qwen2* @ArthurZucker | ||||
| /src/transformers/models/qwen2_moe/mod*_qwen2_moe* @ArthurZucker | ||||
| /src/transformers/models/rag/mod*_rag* @ArthurZucker | ||||
| /src/transformers/models/realm/mod*_realm* @ArthurZucker | ||||
| /src/transformers/models/recurrent_gemma/mod*_recurrent_gemma* @ArthurZucker | ||||
| /src/transformers/models/reformer/mod*_reformer* @ArthurZucker | ||||
| /src/transformers/models/rembert/mod*_rembert* @ArthurZucker | ||||
| /src/transformers/models/retribert/mod*_retribert* @ArthurZucker | ||||
| /src/transformers/models/roberta/mod*_roberta* @ArthurZucker | ||||
| /src/transformers/models/roberta_prelayernorm/mod*_roberta_prelayernorm* @ArthurZucker | ||||
| /src/transformers/models/roc_bert/mod*_roc_bert* @ArthurZucker | ||||
| /src/transformers/models/roformer/mod*_roformer* @ArthurZucker | ||||
| /src/transformers/models/rwkv/mod*_rwkv* @ArthurZucker | ||||
| /src/transformers/models/splinter/mod*_splinter* @ArthurZucker | ||||
| /src/transformers/models/squeezebert/mod*_squeezebert* @ArthurZucker | ||||
| /src/transformers/models/stablelm/mod*_stablelm* @ArthurZucker | ||||
| /src/transformers/models/starcoder2/mod*_starcoder2* @ArthurZucker | ||||
| /src/transformers/models/switch_transformers/mod*_switch_transformers* @ArthurZucker | ||||
| /src/transformers/models/t5/mod*_t5* @ArthurZucker | ||||
| /src/transformers/models/t5v1.1/mod*_t5v1.1* @ArthurZucker | ||||
| /src/transformers/models/tapex/mod*_tapex* @ArthurZucker | ||||
| /src/transformers/models/transfo_xl/mod*_transfo_xl* @ArthurZucker | ||||
| /src/transformers/models/ul2/mod*_ul2* @ArthurZucker | ||||
| /src/transformers/models/umt5/mod*_umt5* @ArthurZucker | ||||
| /src/transformers/models/xmod/mod*_xmod* @ArthurZucker | ||||
| /src/transformers/models/xglm/mod*_xglm* @ArthurZucker | ||||
| /src/transformers/models/xlm/mod*_xlm* @ArthurZucker | ||||
| /src/transformers/models/xlm_prophetnet/mod*_xlm_prophetnet* @ArthurZucker | ||||
| /src/transformers/models/xlm_roberta/mod*_xlm_roberta* @ArthurZucker | ||||
| /src/transformers/models/xlm_roberta_xl/mod*_xlm_roberta_xl* @ArthurZucker | ||||
| /src/transformers/models/xlm_v/mod*_xlm_v* @ArthurZucker | ||||
| /src/transformers/models/xlnet/mod*_xlnet* @ArthurZucker | ||||
| /src/transformers/models/yoso/mod*_yoso* @ArthurZucker | ||||
| /src/transformers/models/zamba/mod*_zamba* @ArthurZucker | ||||
|  | ||||
| # Vision models | ||||
| /src/transformers/models/beit/mod*_beit* @amyeroberts @qubvel | ||||
| /src/transformers/models/bit/mod*_bit* @amyeroberts @qubvel | ||||
| /src/transformers/models/conditional_detr/mod*_conditional_detr* @amyeroberts @qubvel | ||||
| /src/transformers/models/convnext/mod*_convnext* @amyeroberts @qubvel | ||||
| /src/transformers/models/convnextv2/mod*_convnextv2* @amyeroberts @qubvel | ||||
| /src/transformers/models/cvt/mod*_cvt* @amyeroberts @qubvel | ||||
| /src/transformers/models/deformable_detr/mod*_deformable_detr* @amyeroberts @qubvel | ||||
| /src/transformers/models/deit/mod*_deit* @amyeroberts @qubvel | ||||
| /src/transformers/models/depth_anything/mod*_depth_anything* @amyeroberts @qubvel | ||||
| /src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @amyeroberts @qubvel | ||||
| /src/transformers/models/deta/mod*_deta* @amyeroberts @qubvel | ||||
| /src/transformers/models/detr/mod*_detr* @amyeroberts @qubvel | ||||
| /src/transformers/models/dinat/mod*_dinat* @amyeroberts @qubvel | ||||
| /src/transformers/models/dinov2/mod*_dinov2* @amyeroberts @qubvel | ||||
| /src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @amyeroberts @qubvel | ||||
| /src/transformers/models/dit/mod*_dit* @amyeroberts @qubvel | ||||
| /src/transformers/models/dpt/mod*_dpt* @amyeroberts @qubvel | ||||
| /src/transformers/models/efficientformer/mod*_efficientformer* @amyeroberts @qubvel | ||||
| /src/transformers/models/efficientnet/mod*_efficientnet* @amyeroberts @qubvel | ||||
| /src/transformers/models/focalnet/mod*_focalnet* @amyeroberts @qubvel | ||||
| /src/transformers/models/glpn/mod*_glpn* @amyeroberts @qubvel | ||||
| /src/transformers/models/hiera/mod*_hiera* @amyeroberts @qubvel | ||||
| /src/transformers/models/ijepa/mod*_ijepa* @amyeroberts @qubvel | ||||
| /src/transformers/models/imagegpt/mod*_imagegpt* @amyeroberts @qubvel | ||||
| /src/transformers/models/levit/mod*_levit* @amyeroberts @qubvel | ||||
| /src/transformers/models/mask2former/mod*_mask2former* @amyeroberts @qubvel | ||||
| /src/transformers/models/maskformer/mod*_maskformer* @amyeroberts @qubvel | ||||
| /src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @amyeroberts @qubvel | ||||
| /src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @amyeroberts @qubvel | ||||
| /src/transformers/models/mobilevit/mod*_mobilevit* @amyeroberts @qubvel | ||||
| /src/transformers/models/mobilevitv2/mod*_mobilevitv2* @amyeroberts @qubvel | ||||
| /src/transformers/models/nat/mod*_nat* @amyeroberts @qubvel | ||||
| /src/transformers/models/poolformer/mod*_poolformer* @amyeroberts @qubvel | ||||
| /src/transformers/models/pvt/mod*_pvt* @amyeroberts @qubvel | ||||
| /src/transformers/models/pvt_v2/mod*_pvt_v2* @amyeroberts @qubvel | ||||
| /src/transformers/models/regnet/mod*_regnet* @amyeroberts @qubvel | ||||
| /src/transformers/models/resnet/mod*_resnet* @amyeroberts @qubvel | ||||
| /src/transformers/models/rt_detr/mod*_rt_detr* @amyeroberts @qubvel | ||||
| /src/transformers/models/segformer/mod*_segformer* @amyeroberts @qubvel | ||||
| /src/transformers/models/seggpt/mod*_seggpt* @amyeroberts @qubvel | ||||
| /src/transformers/models/superpoint/mod*_superpoint* @amyeroberts @qubvel | ||||
| /src/transformers/models/swiftformer/mod*_swiftformer* @amyeroberts @qubvel | ||||
| /src/transformers/models/swin/mod*_swin* @amyeroberts @qubvel | ||||
| /src/transformers/models/swinv2/mod*_swinv2* @amyeroberts @qubvel | ||||
| /src/transformers/models/swin2sr/mod*_swin2sr* @amyeroberts @qubvel | ||||
| /src/transformers/models/table_transformer/mod*_table_transformer* @amyeroberts @qubvel | ||||
| /src/transformers/models/textnet/mod*_textnet* @amyeroberts @qubvel | ||||
| /src/transformers/models/timm_wrapper/mod*_timm_wrapper* @amyeroberts @qubvel | ||||
| /src/transformers/models/upernet/mod*_upernet* @amyeroberts @qubvel | ||||
| /src/transformers/models/van/mod*_van* @amyeroberts @qubvel | ||||
| /src/transformers/models/vit/mod*_vit* @amyeroberts @qubvel | ||||
| /src/transformers/models/vit_hybrid/mod*_vit_hybrid* @amyeroberts @qubvel | ||||
| /src/transformers/models/vitdet/mod*_vitdet* @amyeroberts @qubvel | ||||
| /src/transformers/models/vit_mae/mod*_vit_mae* @amyeroberts @qubvel | ||||
| /src/transformers/models/vitmatte/mod*_vitmatte* @amyeroberts @qubvel | ||||
| /src/transformers/models/vit_msn/mod*_vit_msn* @amyeroberts @qubvel | ||||
| /src/transformers/models/vitpose/mod*_vitpose* @amyeroberts @qubvel | ||||
| /src/transformers/models/yolos/mod*_yolos* @amyeroberts @qubvel | ||||
| /src/transformers/models/zoedepth/mod*_zoedepth* @amyeroberts @qubvel | ||||
|  | ||||
| # Audio models | ||||
| /src/transformers/models/audio_spectrogram_transformer/mod*_audio_spectrogram_transformer* @eustlb | ||||
| /src/transformers/models/bark/mod*_bark* @eustlb | ||||
| /src/transformers/models/clap/mod*_clap* @eustlb | ||||
| /src/transformers/models/dac/mod*_dac* @eustlb | ||||
| /src/transformers/models/encodec/mod*_encodec* @eustlb | ||||
| /src/transformers/models/hubert/mod*_hubert* @eustlb | ||||
| /src/transformers/models/mctct/mod*_mctct* @eustlb | ||||
| /src/transformers/models/mimi/mod*_mimi* @eustlb | ||||
| /src/transformers/models/mms/mod*_mms* @eustlb | ||||
| /src/transformers/models/moshi/mod*_moshi* @eustlb | ||||
| /src/transformers/models/musicgen/mod*_musicgen* @eustlb | ||||
| /src/transformers/models/musicgen_melody/mod*_musicgen_melody* @eustlb | ||||
| /src/transformers/models/pop2piano/mod*_pop2piano* @eustlb | ||||
| /src/transformers/models/seamless_m4t/mod*_seamless_m4t* @eustlb | ||||
| /src/transformers/models/seamless_m4t_v2/mod*_seamless_m4t_v2* @eustlb | ||||
| /src/transformers/models/sew/mod*_sew* @eustlb | ||||
| /src/transformers/models/sew_d/mod*_sew_d* @eustlb | ||||
| /src/transformers/models/speech_to_text/mod*_speech_to_text* @eustlb | ||||
| /src/transformers/models/speech_to_text_2/mod*_speech_to_text_2* @eustlb | ||||
| /src/transformers/models/speecht5/mod*_speecht5* @eustlb | ||||
| /src/transformers/models/unispeech/mod*_unispeech* @eustlb | ||||
| /src/transformers/models/unispeech_sat/mod*_unispeech_sat* @eustlb | ||||
| /src/transformers/models/univnet/mod*_univnet* @eustlb | ||||
| /src/transformers/models/vits/mod*_vits* @eustlb | ||||
| /src/transformers/models/wav2vec2/mod*_wav2vec2* @eustlb | ||||
| /src/transformers/models/wav2vec2_bert/mod*_wav2vec2_bert* @eustlb | ||||
| /src/transformers/models/wav2vec2_conformer/mod*_wav2vec2_conformer* @eustlb | ||||
| /src/transformers/models/wav2vec2_phoneme/mod*_wav2vec2_phoneme* @eustlb | ||||
| /src/transformers/models/wavlm/mod*_wavlm* @eustlb | ||||
| /src/transformers/models/whisper/mod*_whisper* @eustlb | ||||
| /src/transformers/models/xls_r/mod*_xls_r* @eustlb | ||||
| /src/transformers/models/xlsr_wav2vec2/mod*_xlsr_wav2vec2* @eustlb | ||||
|  | ||||
| # Video models | ||||
| /src/transformers/models/timesformer/mod*_timesformer* @Rocketknight1 | ||||
| /src/transformers/models/videomae/mod*_videomae* @Rocketknight1 | ||||
| /src/transformers/models/vivit/mod*_vivit* @Rocketknight1 | ||||
|  | ||||
| # Multimodal models | ||||
| /src/transformers/models/align/mod*_align* @zucchini-nlp | ||||
| /src/transformers/models/altclip/mod*_altclip* @zucchini-nlp | ||||
| /src/transformers/models/aria/mod*_aria* @zucchini-nlp | ||||
| /src/transformers/models/blip/mod*_blip* @zucchini-nlp | ||||
| /src/transformers/models/blip_2/mod*_blip_2* @zucchini-nlp | ||||
| /src/transformers/models/bridgetower/mod*_bridgetower* @zucchini-nlp | ||||
| /src/transformers/models/bros/mod*_bros* @zucchini-nlp | ||||
| /src/transformers/models/chameleon/mod*_chameleon* @zucchini-nlp | ||||
| /src/transformers/models/chinese_clip/mod*_chinese_clip* @zucchini-nlp | ||||
| /src/transformers/models/clip/mod*_clip* @zucchini-nlp | ||||
| /src/transformers/models/clipseg/mod*_clipseg* @zucchini-nlp | ||||
| /src/transformers/models/clvp/mod*_clvp* @zucchini-nlp | ||||
| /src/transformers/models/colpali/mod*_colpali* @zucchini-nlp @yonigozlan | ||||
| /src/transformers/models/data2vec/mod*_data2vec* @zucchini-nlp | ||||
| /src/transformers/models/deplot/mod*_deplot* @zucchini-nlp | ||||
| /src/transformers/models/donut/mod*_donut* @zucchini-nlp | ||||
| /src/transformers/models/flava/mod*_flava* @zucchini-nlp | ||||
| /src/transformers/models/git/mod*_git* @zucchini-nlp | ||||
| /src/transformers/models/grounding_dino/mod*_grounding_dino* @qubvel | ||||
| /src/transformers/models/groupvit/mod*_groupvit* @zucchini-nlp | ||||
| /src/transformers/models/idefics/mod*_idefics* @zucchini-nlp | ||||
| /src/transformers/models/idefics2/mod*_idefics2* @zucchini-nlp | ||||
| /src/transformers/models/idefics3/mod*_idefics3* @zucchini-nlp | ||||
| /src/transformers/models/instructblip/mod*_instructblip* @zucchini-nlp | ||||
| /src/transformers/models/instructblipvideo/mod*_instructblipvideo* @zucchini-nlp | ||||
| /src/transformers/models/kosmos_2/mod*_kosmos_2* @zucchini-nlp | ||||
| /src/transformers/models/layoutlm/mod*_layoutlm* @NielsRogge | ||||
| /src/transformers/models/layoutlmv2/mod*_layoutlmv2* @NielsRogge | ||||
| /src/transformers/models/layoutlmv3/mod*_layoutlmv3* @NielsRogge | ||||
| /src/transformers/models/layoutxlm/mod*_layoutxlm* @NielsRogge | ||||
| /src/transformers/models/lilt/mod*_lilt* @zucchini-nlp | ||||
| /src/transformers/models/llava/mod*_llava* @zucchini-nlp @arthurzucker | ||||
| /src/transformers/models/llava_next/mod*_llava_next* @zucchini-nlp | ||||
| /src/transformers/models/llava_next_video/mod*_llava_next_video* @zucchini-nlp | ||||
| /src/transformers/models/llava_onevision/mod*_llava_onevision* @zucchini-nlp | ||||
| /src/transformers/models/lxmert/mod*_lxmert* @zucchini-nlp | ||||
| /src/transformers/models/matcha/mod*_matcha* @zucchini-nlp | ||||
| /src/transformers/models/mgp_str/mod*_mgp_str* @zucchini-nlp | ||||
| /src/transformers/models/mllama/mod*_mllama* @zucchini-nlp | ||||
| /src/transformers/models/nougat/mod*_nougat* @NielsRogge | ||||
| /src/transformers/models/omdet_turbo/mod*_omdet_turbo* @qubvel @yonigozlan | ||||
| /src/transformers/models/oneformer/mod*_oneformer* @zucchini-nlp | ||||
| /src/transformers/models/owlvit/mod*_owlvit* @qubvel | ||||
| /src/transformers/models/owlv2/mod*_owlv2* @qubvel | ||||
| /src/transformers/models/paligemma/mod*_paligemma* @zucchini-nlp @molbap | ||||
| /src/transformers/models/perceiver/mod*_perceiver* @zucchini-nlp | ||||
| /src/transformers/models/pix2struct/mod*_pix2struct* @zucchini-nlp | ||||
| /src/transformers/models/pixtral/mod*_pixtral* @zucchini-nlp @ArthurZucker | ||||
| /src/transformers/models/qwen2_audio/mod*_qwen2_audio* @zucchini-nlp @ArthurZucker | ||||
| /src/transformers/models/qwen2_vl/mod*_qwen2_vl* @zucchini-nlp @ArthurZucker | ||||
| /src/transformers/models/sam/mod*_sam* @zucchini-nlp @ArthurZucker | ||||
| /src/transformers/models/siglip/mod*_siglip* @zucchini-nlp | ||||
| /src/transformers/models/speech_encoder_decoder/mod*_speech_encoder_decoder* @zucchini-nlp | ||||
| /src/transformers/models/tapas/mod*_tapas* @NielsRogge | ||||
| /src/transformers/models/trocr/mod*_trocr* @zucchini-nlp | ||||
| /src/transformers/models/tvlt/mod*_tvlt* @zucchini-nlp | ||||
| /src/transformers/models/tvp/mod*_tvp* @zucchini-nlp | ||||
| /src/transformers/models/udop/mod*_udop* @zucchini-nlp | ||||
| /src/transformers/models/video_llava/mod*_video_llava* @zucchini-nlp | ||||
| /src/transformers/models/vilt/mod*_vilt* @zucchini-nlp | ||||
| /src/transformers/models/vipllava/mod*_vipllava* @zucchini-nlp | ||||
| /src/transformers/models/vision_encoder_decoder/mod*_vision_encoder_decoder* @Rocketknight1 | ||||
| /src/transformers/models/vision_text_dual_encoder/mod*_vision_text_dual_encoder* @Rocketknight1 | ||||
| /src/transformers/models/visual_bert/mod*_visual_bert* @zucchini-nlp | ||||
| /src/transformers/models/xclip/mod*_xclip* @zucchini-nlp | ||||
|  | ||||
| # Reinforcement learning models | ||||
| /src/transformers/models/decision_transformer/mod*_decision_transformer* @Rocketknight1 | ||||
| /src/transformers/models/trajectory_transformer/mod*_trajectory_transformer* @Rocketknight1 | ||||
|  | ||||
| # Time series models | ||||
| /src/transformers/models/autoformer/mod*_autoformer* @Rocketknight1 | ||||
| /src/transformers/models/informer/mod*_informer* @Rocketknight1 | ||||
| /src/transformers/models/patchtsmixer/mod*_patchtsmixer* @Rocketknight1 | ||||
| /src/transformers/models/patchtst/mod*_patchtst* @Rocketknight1 | ||||
| /src/transformers/models/time_series_transformer/mod*_time_series_transformer* @Rocketknight1 | ||||
|  | ||||
| # Graph models | ||||
| /src/transformers/models/graphormer/mod*_graphormer* @clefourrier | ||||
|  | ||||
| # Finally, files with no owners that shouldn't generate pings, usually automatically generated and checked in the CI | ||||
| utils/dummy* | ||||
							
								
								
									
										2
									
								
								.github/workflows/add-model-like.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/add-model-like.yml
									
									
									
									
										vendored
									
									
								
							| @ -54,7 +54,7 @@ jobs: | ||||
|       - name: Create model files | ||||
|         run: | | ||||
|           . ~/venv/bin/activate | ||||
|           transformers add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo . | ||||
|           transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo . | ||||
|           make style | ||||
|           make fix-copies | ||||
|  | ||||
|  | ||||
							
								
								
									
										26
									
								
								.github/workflows/assign-reviewers.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										26
									
								
								.github/workflows/assign-reviewers.yml
									
									
									
									
										vendored
									
									
								
							| @ -1,26 +0,0 @@ | ||||
| name: Assign PR Reviewers | ||||
| on: | ||||
|   pull_request_target: | ||||
|     branches: | ||||
|       - main | ||||
|     types: [ready_for_review] | ||||
|  | ||||
| jobs: | ||||
|   assign_reviewers: | ||||
|     permissions: | ||||
|        pull-requests: write | ||||
|     runs-on: ubuntu-22.04 | ||||
|     steps: | ||||
|       - uses: actions/checkout@v4 | ||||
|       - name: Set up Python | ||||
|         uses: actions/setup-python@v5 | ||||
|         with: | ||||
|           python-version: '3.13' | ||||
|       - name: Install dependencies | ||||
|         run: | | ||||
|           python -m pip install --upgrade pip | ||||
|           pip install PyGithub | ||||
|       - name: Run assignment script | ||||
|         env: | ||||
|           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||
|         run: python .github/scripts/assign_reviewers.py | ||||
							
								
								
									
										3
									
								
								.github/workflows/benchmark.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.github/workflows/benchmark.yml
									
									
									
									
										vendored
									
									
								
							| @ -64,7 +64,7 @@ jobs: | ||||
|             commit_id=$GITHUB_SHA | ||||
|           fi | ||||
|           commit_msg=$(git show -s --format=%s | cut -c1-70) | ||||
|           python3 benchmark/benchmarks_entrypoint.py "huggingface/transformers" "$BRANCH_NAME" "$commit_id" "$commit_msg" | ||||
|           python3 benchmark/benchmarks_entrypoint.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" | ||||
|         env: | ||||
|           HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} | ||||
|           # Enable this to see debug logs | ||||
| @ -73,4 +73,3 @@ jobs: | ||||
|           PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }} | ||||
|           PGUSER: transformers_benchmarks | ||||
|           PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }} | ||||
|           BRANCH_NAME: ${{ github.head_ref || github.ref_name }} | ||||
|  | ||||
							
								
								
									
										42
									
								
								.github/workflows/build-docker-images.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										42
									
								
								.github/workflows/build-docker-images.yml
									
									
									
									
										vendored
									
									
								
							| @ -19,7 +19,7 @@ concurrency: | ||||
|  | ||||
| jobs: | ||||
|   latest-docker: | ||||
|     name: "Latest PyTorch [dev]" | ||||
|     name: "Latest PyTorch + TensorFlow [dev]" | ||||
|     runs-on: | ||||
|       group: aws-general-8-plus | ||||
|     steps: | ||||
| @ -70,7 +70,7 @@ jobs: | ||||
|   latest-torch-deepspeed-docker: | ||||
|     name: "Latest PyTorch + DeepSpeed" | ||||
|     runs-on: | ||||
|       group: aws-g4dn-2xlarge-cache | ||||
|       group: aws-general-8-plus | ||||
|     steps: | ||||
|       - | ||||
|         name: Set up Docker Buildx | ||||
| @ -267,6 +267,44 @@ jobs: | ||||
|           status: ${{ job.status }} | ||||
|           slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
|  | ||||
|   latest-tensorflow: | ||||
|     name: "Latest TensorFlow [dev]" | ||||
|     # Push CI doesn't need this image | ||||
|     if: inputs.image_postfix != '-push-ci' | ||||
|     runs-on: | ||||
|       group: aws-general-8-plus | ||||
|     steps: | ||||
|       - | ||||
|         name: Set up Docker Buildx | ||||
|         uses: docker/setup-buildx-action@v3 | ||||
|       - | ||||
|         name: Check out code | ||||
|         uses: actions/checkout@v4 | ||||
|       - | ||||
|         name: Login to DockerHub | ||||
|         uses: docker/login-action@v3 | ||||
|         with: | ||||
|           username: ${{ secrets.DOCKERHUB_USERNAME }} | ||||
|           password: ${{ secrets.DOCKERHUB_PASSWORD }} | ||||
|       - | ||||
|         name: Build and push | ||||
|         uses: docker/build-push-action@v5 | ||||
|         with: | ||||
|           context: ./docker/transformers-tensorflow-gpu | ||||
|           build-args: | | ||||
|             REF=main | ||||
|           push: true | ||||
|           tags: huggingface/transformers-tensorflow-gpu | ||||
|  | ||||
|       - name: Post to Slack | ||||
|         if: always() | ||||
|         uses: huggingface/hf-workflows/.github/actions/post-slack@main | ||||
|         with: | ||||
|           slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} | ||||
|           title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build  | ||||
|           status: ${{ job.status }} | ||||
|           slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
|  | ||||
|   latest-pytorch-deepspeed-amd: | ||||
|     name: "PyTorch + DeepSpeed (AMD) [dev]" | ||||
|     runs-on: | ||||
|  | ||||
| @ -42,7 +42,7 @@ jobs: | ||||
|   nightly-torch-deepspeed-docker: | ||||
|     name: "Nightly PyTorch + DeepSpeed" | ||||
|     runs-on: | ||||
|       group: aws-g4dn-2xlarge-cache | ||||
|       group: aws-general-8-plus | ||||
|     steps: | ||||
|       - | ||||
|         name: Set up Docker Buildx | ||||
|  | ||||
							
								
								
									
										18
									
								
								.github/workflows/build_pr_documentation.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										18
									
								
								.github/workflows/build_pr_documentation.yml
									
									
									
									
										vendored
									
									
								
							| @ -2,15 +2,6 @@ name: Build PR Documentation | ||||
|  | ||||
| on: | ||||
|   pull_request: | ||||
|   workflow_call: | ||||
|     inputs: | ||||
|       pr_number: | ||||
|         type: string | ||||
|         required: true | ||||
|       commit_sha: | ||||
|         type: string | ||||
|         required: true | ||||
|  | ||||
|  | ||||
| concurrency: | ||||
|   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} | ||||
| @ -18,9 +9,10 @@ concurrency: | ||||
|  | ||||
| jobs: | ||||
|   build: | ||||
|     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@6e2eb04a2604817c97be03786efa494fe3acae90 | ||||
|     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main | ||||
|     with: | ||||
|       commit_sha: ${{ inputs.commit_sha || github.event.pull_request.head.sha }} | ||||
|       pr_number: ${{ inputs.pr_number || github.event.number }} | ||||
|       commit_sha: ${{ github.event.pull_request.head.sha }} | ||||
|       pr_number: ${{ github.event.number }} | ||||
|       package: transformers | ||||
|       languages: en | ||||
|       languages: ar de en es fr hi it ko pt tr zh ja te | ||||
|       custom_container: huggingface/transformers-doc-builder | ||||
|  | ||||
| @ -9,18 +9,6 @@ on: | ||||
|       start_sha: | ||||
|         required: true | ||||
|         type: string | ||||
|       job: | ||||
|         required: true | ||||
|         type: string | ||||
|       slack_report_channel: | ||||
|         required: true | ||||
|         type: string | ||||
|       ci_event: | ||||
|         required: true | ||||
|         type: string | ||||
|       report_repo_id: | ||||
|         required: true | ||||
|         type: string | ||||
| 
 | ||||
| 
 | ||||
| env: | ||||
| @ -38,128 +26,77 @@ env: | ||||
| 
 | ||||
| 
 | ||||
| jobs: | ||||
|   check_new_failures: | ||||
|   run_models_gpu: | ||||
|     name: " " | ||||
|     runs-on: | ||||
|       group: aws-g4dn-4xlarge-cache | ||||
|       group: aws-g4dn-2xlarge-cache | ||||
|     container: | ||||
|       image: ${{ inputs.docker }} | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     steps: | ||||
|       - uses: actions/download-artifact@v4 | ||||
|         with: | ||||
|           name: ci_results_${{ inputs.job }} | ||||
|           path: /transformers/ci_results_${{ inputs.job }} | ||||
| 
 | ||||
|       - name: Check file | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then | ||||
|             echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..." | ||||
|             echo "process=true" >> $GITHUB_ENV | ||||
|           else | ||||
|             echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort." | ||||
|             echo "process=false" >> $GITHUB_ENV | ||||
|           fi | ||||
| 
 | ||||
|       - uses: actions/download-artifact@v4 | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         with: | ||||
|           pattern: setup_values* | ||||
|           path: setup_values | ||||
|           merge-multiple: true | ||||
| 
 | ||||
|       - name: Prepare some setup values | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: | | ||||
|           if [ -f setup_values/prev_workflow_run_id.txt ]; then | ||||
|             echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> $GITHUB_ENV | ||||
|           else | ||||
|             echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV | ||||
|           fi | ||||
| 
 | ||||
|           if [ -f setup_values/other_workflow_run_id.txt ]; then | ||||
|             echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV | ||||
|           else | ||||
|             echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV | ||||
|           fi | ||||
|           name: ci_results_run_models_gpu | ||||
|           path: /transformers/ci_results_run_models_gpu | ||||
| 
 | ||||
|       - name: Update clone | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: git fetch && git checkout ${{ github.sha }} | ||||
| 
 | ||||
|       - name: Get target commit | ||||
|         working-directory: /transformers/utils | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: | | ||||
|           echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV | ||||
|           echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"]); print(commit)')" >> $GITHUB_ENV | ||||
| 
 | ||||
|       - name: Checkout to `start_sha` | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: git fetch && git checkout ${{ inputs.start_sha }} | ||||
| 
 | ||||
|       - name: Reinstall transformers in edit mode (remove the one installed during docker image build) | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | ||||
| 
 | ||||
|       - name: NVIDIA-SMI | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: | | ||||
|           nvidia-smi | ||||
| 
 | ||||
|       - name: Environment | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: | | ||||
|           python3 utils/print_env.py | ||||
| 
 | ||||
|       - name: Show installed libraries and their versions | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: pip freeze | ||||
| 
 | ||||
|       - name: Check failed tests | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json | ||||
|         run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_run_models_gpu/new_model_failures.json --output_file new_model_failures_with_bad_commit.json | ||||
| 
 | ||||
|       - name: Show results | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: | | ||||
|           ls -l new_failures_with_bad_commit.json | ||||
|           cat new_failures_with_bad_commit.json | ||||
|           ls -l new_model_failures_with_bad_commit.json | ||||
|           cat new_model_failures_with_bad_commit.json | ||||
| 
 | ||||
|       - name: Checkout back | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: | | ||||
|           git checkout ${{ inputs.start_sha }} | ||||
| 
 | ||||
|       - name: Process report | ||||
|         shell: bash | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         env: | ||||
|           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} | ||||
|           TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} | ||||
|           JOB_NAME: ${{ inputs.job }} | ||||
|           REPORT_REPO_ID: ${{ inputs.report_repo_id }} | ||||
|         run: | | ||||
|           python3 utils/process_bad_commit_report.py | ||||
| 
 | ||||
|       - name: Process report | ||||
|         shell: bash | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         env: | ||||
|           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} | ||||
|           TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} | ||||
|           JOB_NAME: ${{ inputs.job }} | ||||
|           REPORT_REPO_ID: ${{ inputs.report_repo_id }} | ||||
|         run: | | ||||
|           { | ||||
|             echo 'REPORT_TEXT<<EOF' | ||||
| @ -167,31 +104,17 @@ jobs: | ||||
|             echo EOF | ||||
|           } >> "$GITHUB_ENV" | ||||
| 
 | ||||
|       - name: Prepare Slack report title | ||||
|         working-directory: /transformers | ||||
|         if: ${{ env.process == 'true' }} | ||||
|         run: | | ||||
|           pip install slack_sdk | ||||
|           echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV | ||||
| 
 | ||||
|       - name: Send processed report | ||||
|         if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }} | ||||
|         if: ${{ !endsWith(env.REPORT_TEXT, '{}') }} | ||||
|         uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 | ||||
|         with: | ||||
|           # Slack channel id, channel name, or user id to post message. | ||||
|           # See also: https://api.slack.com/methods/chat.postMessage#channels | ||||
|           channel-id: '#${{ inputs.slack_report_channel }}' | ||||
|           channel-id: '#transformers-ci-feedback-tests' | ||||
|           # For posting a rich message using Block Kit | ||||
|           payload: | | ||||
|             { | ||||
|               "blocks": [ | ||||
|                 { | ||||
|                   "type": "header", | ||||
|                   "text": { | ||||
|                     "type": "plain_text", | ||||
|                     "text": "${{ env.title }}" | ||||
|                   } | ||||
|                 }, | ||||
|                 { | ||||
|                   "type": "section", | ||||
|                   "text": { | ||||
							
								
								
									
										2
									
								
								.github/workflows/doctest_job.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/doctest_job.yml
									
									
									
									
										vendored
									
									
								
							| @ -28,7 +28,7 @@ jobs: | ||||
|       matrix: | ||||
|         split_keys: ${{ fromJson(inputs.split_keys) }} | ||||
|     runs-on:  | ||||
|       group: aws-g4dn-4xlarge-cache | ||||
|       group: aws-g4dn-2xlarge-cache | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu | ||||
|       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/workflows/doctests.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/doctests.yml
									
									
									
									
										vendored
									
									
								
							| @ -15,7 +15,7 @@ jobs: | ||||
|   setup: | ||||
|     name: Setup | ||||
|     runs-on:  | ||||
|       group: aws-g4dn-4xlarge-cache | ||||
|       group: aws-g4dn-2xlarge-cache | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu | ||||
|       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|  | ||||
							
								
								
									
										22
									
								
								.github/workflows/model_jobs.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										22
									
								
								.github/workflows/model_jobs.yml
									
									
									
									
										vendored
									
									
								
							| @ -18,10 +18,6 @@ on: | ||||
|       docker: | ||||
|         required: true | ||||
|         type: string | ||||
|       report_name_prefix: | ||||
|         required: false | ||||
|         default: run_models_gpu | ||||
|         type: string | ||||
|  | ||||
| env: | ||||
|   HF_HOME: /mnt/cache | ||||
| @ -107,7 +103,7 @@ jobs: | ||||
|         run: | | ||||
|           echo "${{ inputs.machine_type }}" | ||||
|  | ||||
|           if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then | ||||
|           if [ "${{ inputs.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
| @ -120,23 +116,23 @@ jobs: | ||||
|  | ||||
|       - name: Run all tests on GPU | ||||
|         working-directory: /transformers | ||||
|         run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} | ||||
|         run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} | ||||
|  | ||||
|       - name: Failure short reports | ||||
|         if: ${{ failure() }} | ||||
|         continue-on-error: true | ||||
|         run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt | ||||
|         run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt | ||||
|  | ||||
|       - name: Run test | ||||
|         shell: bash | ||||
|         run: | | ||||
|           mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports | ||||
|           echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt | ||||
|           echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports" | ||||
|           mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports | ||||
|           echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt | ||||
|           echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" | ||||
|  | ||||
|       - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports" | ||||
|       - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" | ||||
|         if: ${{ always() }} | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports | ||||
|           path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports | ||||
|           name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports | ||||
|           path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports | ||||
|  | ||||
| @ -1,68 +0,0 @@ | ||||
| # Used to notify core maintainers about new model PR being merged | ||||
| name: New model PR merged notification | ||||
|  | ||||
| on: | ||||
|   push: | ||||
|     branches: | ||||
|       - main | ||||
|     paths: | ||||
|       - 'src/transformers/models/*/modeling_*' | ||||
|  | ||||
| jobs: | ||||
|   notify_new_model: | ||||
|     name: Notify new model | ||||
|     runs-on: ubuntu-22.04 | ||||
|     steps: | ||||
|       - uses: actions/checkout@v4 | ||||
|         with: | ||||
|           fetch-depth: 0 | ||||
|       - name: Check new model | ||||
|         shell: bash | ||||
|         run: | | ||||
|           python -m pip install gitpython | ||||
|           python -c 'from utils.pr_slow_ci_models import get_new_model; new_model = get_new_model(diff_with_last_commit=True); print(new_model)' | tee output.txt | ||||
|           echo "NEW_MODEL=$(tail -n 1 output.txt)" >> $GITHUB_ENV | ||||
|           echo "COMMIT_SHA=$(git log -1 --format=%H)" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: print commit sha | ||||
|         if: ${{ env.NEW_MODEL != ''}} | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "$COMMIT_SHA" | ||||
|  | ||||
|       - name: print new model | ||||
|         if: ${{ env.NEW_MODEL != ''}} | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "$NEW_MODEL" | ||||
|  | ||||
|       - name: Notify | ||||
|         if: ${{ env.NEW_MODEL != ''}} | ||||
|         uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 | ||||
|         with: | ||||
|           # Slack channel id, channel name, or user id to post message. | ||||
|           # See also: https://api.slack.com/methods/chat.postMessage#channels | ||||
|           channel-id: transformers-new-model-notification | ||||
|           # For posting a rich message using Block Kit | ||||
|           payload: | | ||||
|             { | ||||
|               "blocks": [ | ||||
|                 { | ||||
|                   "type": "header", | ||||
|                   "text": { | ||||
|                     "type": "plain_text", | ||||
|                     "text": "New model!", | ||||
|                     "emoji": true | ||||
|                   } | ||||
|                 }, | ||||
|                 { | ||||
|                   "type": "section", | ||||
|                   "text": { | ||||
|                     "type": "mrkdwn", | ||||
|                     "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh\ncommit SHA: ${{ env.COMMIT_SHA }}" | ||||
|                   } | ||||
|                 } | ||||
|               ] | ||||
|             } | ||||
|         env: | ||||
|           SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
							
								
								
									
										34
									
								
								.github/workflows/pr-style-bot.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										34
									
								
								.github/workflows/pr-style-bot.yml
									
									
									
									
										vendored
									
									
								
							| @ -1,34 +0,0 @@ | ||||
| # To run this bot, comment "@bot /style" on a PR | ||||
| name: Style Bot | ||||
|  | ||||
| on: | ||||
|   issue_comment: | ||||
|     types: [created] | ||||
|  | ||||
| permissions: | ||||
|   contents: write | ||||
|   pull-requests: write | ||||
|  | ||||
| jobs: | ||||
|   style: | ||||
|     uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@639ee721e149a281fe726a50a2cc1354b48bc463 | ||||
|     with: | ||||
|       python_quality_dependencies: "[quality]" | ||||
|       style_command_type: "default" | ||||
|     secrets: | ||||
|       bot_token: ${{ secrets.GITHUB_TOKEN }} | ||||
|  | ||||
|   check-outputs: | ||||
|     runs-on: ubuntu-latest | ||||
|     needs: style | ||||
|     steps: | ||||
|       - run: echo ${{ needs.style.outputs.pr_number }} | ||||
|       - run: echo ${{ needs.style.outputs.new_commit_sha }} | ||||
|  | ||||
|   trigger: | ||||
|     needs: style | ||||
|     if: needs.style.outputs.new_commit_sha != '' | ||||
|     uses: "./.github/workflows/build_pr_documentation.yml" | ||||
|     with: | ||||
|       pr_number: ${{ needs.style.outputs.pr_number }} | ||||
|       commit_sha: ${{ needs.style.outputs.new_commit_sha }} | ||||
							
								
								
									
										2
									
								
								.github/workflows/push-important-models.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/push-important-models.yml
									
									
									
									
										vendored
									
									
								
							| @ -27,7 +27,7 @@ jobs: | ||||
|  | ||||
|       - name: Get changed files | ||||
|         id: changed-files | ||||
|         uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c | ||||
|         uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42 | ||||
|         with: | ||||
|           files: src/transformers/models/** | ||||
|  | ||||
|  | ||||
							
								
								
									
										12
									
								
								.github/workflows/self-comment-ci.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										12
									
								
								.github/workflows/self-comment-ci.yml
									
									
									
									
										vendored
									
									
								
							| @ -29,7 +29,7 @@ jobs: | ||||
|     runs-on: ubuntu-22.04 | ||||
|     name: Get PR number | ||||
|     # For security: only allow team members to run | ||||
|     if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }} | ||||
|     if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }} | ||||
|     outputs: | ||||
|       PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }} | ||||
|     steps: | ||||
| @ -145,7 +145,7 @@ jobs: | ||||
|         env: | ||||
|           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||
|           MODELS: ${{ needs.get-tests.outputs.models }} | ||||
|           BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}" | ||||
|           BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}" | ||||
|         run: | | ||||
|           gh api \ | ||||
|             --method POST \ | ||||
| @ -185,7 +185,7 @@ jobs: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         folders: ${{ fromJson(needs.get-tests.outputs.models) }} | ||||
|         machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|         machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|     runs-on: | ||||
|        group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
| @ -239,7 +239,7 @@ jobs: | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
| @ -292,7 +292,7 @@ jobs: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }} | ||||
|         machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|         machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
| @ -338,7 +338,7 @@ jobs: | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/workflows/self-push-caller.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/self-push-caller.yml
									
									
									
									
										vendored
									
									
								
							| @ -25,7 +25,7 @@ jobs: | ||||
|          | ||||
|         - name: Get changed files | ||||
|           id: changed-files | ||||
|           uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c | ||||
|           uses: tj-actions/changed-files@v41 | ||||
|          | ||||
|         - name: Was setup changed  | ||||
|           id: was_changed | ||||
|  | ||||
							
								
								
									
										55
									
								
								.github/workflows/self-scheduled-amd-mi210-caller.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								.github/workflows/self-scheduled-amd-mi210-caller.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,55 @@ | ||||
| name: Self-hosted runner (AMD mi210 scheduled CI caller) | ||||
|  | ||||
| on: | ||||
|   workflow_run: | ||||
|     workflows: ["Self-hosted runner (AMD scheduled CI caller)"] | ||||
|     branches: ["main"] | ||||
|     types: [completed] | ||||
|   push: | ||||
|     branches: | ||||
|       - run_amd_scheduled_ci_caller* | ||||
|  | ||||
| jobs: | ||||
|   model-ci: | ||||
|     name: Model CI | ||||
|     uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main | ||||
|     with: | ||||
|       job: run_models_gpu | ||||
|       slack_report_channel: "#transformers-ci-daily-amd" | ||||
|       runner: mi210 | ||||
|       docker: huggingface/transformers-pytorch-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi210 | ||||
|     secrets: inherit | ||||
|  | ||||
|   torch-pipeline: | ||||
|     name: Torch pipeline CI | ||||
|     uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main | ||||
|     with: | ||||
|       job: run_pipelines_torch_gpu | ||||
|       slack_report_channel: "#transformers-ci-daily-amd" | ||||
|       runner: mi210 | ||||
|       docker: huggingface/transformers-pytorch-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi210 | ||||
|     secrets: inherit | ||||
|  | ||||
|   example-ci: | ||||
|     name: Example CI | ||||
|     uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main | ||||
|     with: | ||||
|       job: run_examples_gpu | ||||
|       slack_report_channel: "#transformers-ci-daily-amd" | ||||
|       runner: mi210 | ||||
|       docker: huggingface/transformers-pytorch-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi210 | ||||
|     secrets: inherit | ||||
|  | ||||
|   deepspeed-ci: | ||||
|     name: DeepSpeed CI | ||||
|     uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main | ||||
|     with: | ||||
|       job: run_torch_cuda_extensions_gpu | ||||
|       slack_report_channel: "#transformers-ci-daily-amd" | ||||
|       runner: mi210 | ||||
|       docker: huggingface/transformers-pytorch-deepspeed-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi210 | ||||
|     secrets: inherit | ||||
| @ -19,7 +19,6 @@ jobs: | ||||
|       runner: mi250 | ||||
|       docker: huggingface/transformers-pytorch-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi250 | ||||
|       report_repo_id: optimum-amd/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   torch-pipeline: | ||||
| @ -31,7 +30,6 @@ jobs: | ||||
|       runner: mi250 | ||||
|       docker: huggingface/transformers-pytorch-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi250 | ||||
|       report_repo_id: optimum-amd/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   example-ci: | ||||
| @ -43,7 +41,6 @@ jobs: | ||||
|       runner: mi250 | ||||
|       docker: huggingface/transformers-pytorch-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi250 | ||||
|       report_repo_id: optimum-amd/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   deepspeed-ci: | ||||
| @ -55,5 +52,4 @@ jobs: | ||||
|       runner: mi250 | ||||
|       docker: huggingface/transformers-pytorch-deepspeed-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi250 | ||||
|       report_repo_id: optimum-amd/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
| @ -1,63 +0,0 @@ | ||||
| name: Self-hosted runner scale set (AMD mi300 scheduled CI caller) | ||||
|  | ||||
| # Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml | ||||
| # For example, 1gpu scale set: amd-mi300-ci-1gpu | ||||
| #              2gpu scale set: amd-mi300-ci-2gpu | ||||
|  | ||||
| on: | ||||
|   workflow_run: | ||||
|     workflows: ["Self-hosted runner (AMD scheduled CI caller)"] | ||||
|     branches: ["main"] | ||||
|     types: [completed] | ||||
|   push: | ||||
|     branches: | ||||
|       - run_amd_scheduled_ci_caller* | ||||
|  | ||||
| jobs: | ||||
|   model-ci: | ||||
|     name: Model CI | ||||
|     uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main | ||||
|     with: | ||||
|       job: run_models_gpu | ||||
|       slack_report_channel: "#amd-hf-ci" | ||||
|       runner_scale_set: amd-mi300-ci | ||||
|       docker: huggingface/transformers-pytorch-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi300 | ||||
|       report_repo_id: optimum-amd/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   torch-pipeline: | ||||
|     name: Torch pipeline CI | ||||
|     uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main | ||||
|     with: | ||||
|       job: run_pipelines_torch_gpu | ||||
|       slack_report_channel: "#amd-hf-ci" | ||||
|       runner_scale_set: amd-mi300-ci | ||||
|       docker: huggingface/transformers-pytorch-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi300 | ||||
|       report_repo_id: optimum-amd/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   example-ci: | ||||
|     name: Example CI | ||||
|     uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main | ||||
|     with: | ||||
|       job: run_examples_gpu | ||||
|       slack_report_channel: "#amd-hf-ci" | ||||
|       runner_scale_set: amd-mi300-ci | ||||
|       docker: huggingface/transformers-pytorch-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi300 | ||||
|       report_repo_id: optimum-amd/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   deepspeed-ci: | ||||
|     name: DeepSpeed CI | ||||
|     uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main | ||||
|     with: | ||||
|       job: run_torch_cuda_extensions_gpu | ||||
|       slack_report_channel: "#amd-hf-ci" | ||||
|       runner_scale_set: amd-mi300-ci | ||||
|       docker: huggingface/transformers-pytorch-deepspeed-amd-gpu | ||||
|       ci_event: Scheduled CI (AMD) - mi300 | ||||
|       report_repo_id: optimum-amd/transformers_daily_ci | ||||
|     secrets: inherit | ||||
							
								
								
									
										65
									
								
								.github/workflows/self-scheduled-caller.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										65
									
								
								.github/workflows/self-scheduled-caller.yml
									
									
									
									
										vendored
									
									
								
							| @ -8,43 +8,8 @@ on: | ||||
|   push: | ||||
|     branches: | ||||
|       - run_scheduled_ci* | ||||
|   workflow_dispatch: | ||||
|     inputs: | ||||
|       prev_workflow_run_id: | ||||
|         description: 'previous workflow run id to compare' | ||||
|         type: string | ||||
|         required: false | ||||
|         default: "" | ||||
|       other_workflow_run_id: | ||||
|         description: 'other workflow run id to compare' | ||||
|         type: string | ||||
|         required: false | ||||
|         default: "" | ||||
|  | ||||
|  | ||||
| # Used for `push` to easily modify the target workflow runs to compare against | ||||
| env: | ||||
|     prev_workflow_run_id: "" | ||||
|     other_workflow_run_id: "" | ||||
|  | ||||
|  | ||||
| jobs: | ||||
|   setup: | ||||
|     name: Setup | ||||
|     runs-on: ubuntu-22.04 | ||||
|     steps: | ||||
|       - name: Setup | ||||
|         run: | | ||||
|           mkdir "setup_values" | ||||
|           echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt" | ||||
|           echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt" | ||||
|  | ||||
|       - name: Upload artifacts | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: setup_values | ||||
|           path: setup_values | ||||
|  | ||||
|   model-ci: | ||||
|     name: Model CI | ||||
|     uses: ./.github/workflows/self-scheduled.yml | ||||
| @ -54,7 +19,6 @@ jobs: | ||||
|       runner: daily-ci | ||||
|       docker: huggingface/transformers-all-latest-gpu | ||||
|       ci_event: Daily CI | ||||
|       report_repo_id: hf-internal-testing/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   torch-pipeline: | ||||
| @ -66,7 +30,17 @@ jobs: | ||||
|       runner: daily-ci | ||||
|       docker: huggingface/transformers-pytorch-gpu | ||||
|       ci_event: Daily CI | ||||
|       report_repo_id: hf-internal-testing/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   tf-pipeline: | ||||
|     name: TF pipeline CI | ||||
|     uses: ./.github/workflows/self-scheduled.yml | ||||
|     with: | ||||
|       job: run_pipelines_tf_gpu | ||||
|       slack_report_channel: "#transformers-ci-daily-pipeline-tf" | ||||
|       runner: daily-ci | ||||
|       docker: huggingface/transformers-tensorflow-gpu | ||||
|       ci_event: Daily CI | ||||
|     secrets: inherit | ||||
|  | ||||
|   example-ci: | ||||
| @ -78,19 +52,6 @@ jobs: | ||||
|       runner: daily-ci | ||||
|       docker: huggingface/transformers-all-latest-gpu | ||||
|       ci_event: Daily CI | ||||
|       report_repo_id: hf-internal-testing/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   trainer-fsdp-ci: | ||||
|     name: Trainer/FSDP CI | ||||
|     uses: ./.github/workflows/self-scheduled.yml | ||||
|     with: | ||||
|       job: run_trainer_and_fsdp_gpu | ||||
|       slack_report_channel: "#transformers-ci-daily-training" | ||||
|       runner: daily-ci | ||||
|       docker: huggingface/transformers-all-latest-gpu | ||||
|       ci_event: Daily CI | ||||
|       report_repo_id: hf-internal-testing/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   deepspeed-ci: | ||||
| @ -98,12 +59,11 @@ jobs: | ||||
|     uses: ./.github/workflows/self-scheduled.yml | ||||
|     with: | ||||
|       job: run_torch_cuda_extensions_gpu | ||||
|       slack_report_channel: "#transformers-ci-daily-training" | ||||
|       slack_report_channel: "#transformers-ci-daily-deepspeed" | ||||
|       runner: daily-ci | ||||
|       docker: huggingface/transformers-pytorch-deepspeed-latest-gpu | ||||
|       ci_event: Daily CI | ||||
|       working-directory-prefix: /workspace | ||||
|       report_repo_id: hf-internal-testing/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
|   quantization-ci: | ||||
| @ -115,5 +75,4 @@ jobs: | ||||
|       runner: daily-ci | ||||
|       docker: huggingface/transformers-quantization-latest-gpu | ||||
|       ci_event: Daily CI | ||||
|       report_repo_id: hf-internal-testing/transformers_daily_ci | ||||
|     secrets: inherit | ||||
|  | ||||
							
								
								
									
										139
									
								
								.github/workflows/self-scheduled.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										139
									
								
								.github/workflows/self-scheduled.yml
									
									
									
									
										vendored
									
									
								
							| @ -28,10 +28,6 @@ on: | ||||
|         default: '' | ||||
|         required: false | ||||
|         type: string | ||||
|       report_repo_id: | ||||
|         required: true | ||||
|         type: string | ||||
|  | ||||
|  | ||||
| env: | ||||
|   HF_HOME: /mnt/cache | ||||
| @ -49,11 +45,11 @@ env: | ||||
|  | ||||
| jobs: | ||||
|   setup: | ||||
|     if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job) | ||||
|     if: contains(fromJSON('["run_models_gpu", "run_quantization_torch_gpu"]'), inputs.job) | ||||
|     name: Setup | ||||
|     strategy: | ||||
|       matrix: | ||||
|         machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|         machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
| @ -81,17 +77,12 @@ jobs: | ||||
|         run: pip freeze | ||||
|  | ||||
|       - id: set-matrix | ||||
|         if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job) | ||||
|         if: ${{ inputs.job == 'run_models_gpu' }} | ||||
|         name: Identify models to test | ||||
|         working-directory: /transformers/tests | ||||
|         run: | | ||||
|           if [ "${{ inputs.job }}" = "run_models_gpu" ]; then | ||||
|           echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT | ||||
|           echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT | ||||
|           elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then | ||||
|             echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT | ||||
|             echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT | ||||
|           fi | ||||
|  | ||||
|       - id: set-matrix-quantization | ||||
|         if: ${{ inputs.job == 'run_quantization_torch_gpu' }} | ||||
| @ -111,7 +102,7 @@ jobs: | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|         machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|         slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} | ||||
|     uses: ./.github/workflows/model_jobs.yml | ||||
|     with: | ||||
| @ -122,32 +113,13 @@ jobs: | ||||
|       docker: ${{ inputs.docker }} | ||||
|     secrets: inherit | ||||
|  | ||||
|   run_trainer_and_fsdp_gpu: | ||||
|     if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }} | ||||
|     name: " " | ||||
|     needs: setup | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|         slice_id: [0, 1] | ||||
|     uses: ./.github/workflows/model_jobs.yml | ||||
|     with: | ||||
|       folder_slices: ${{ needs.setup.outputs.folder_slices }} | ||||
|       machine_type: ${{ matrix.machine_type }} | ||||
|       slice_id: ${{ matrix.slice_id }} | ||||
|       runner: ${{ inputs.runner }} | ||||
|       docker: ${{ inputs.docker }} | ||||
|       report_name_prefix: run_trainer_and_fsdp_gpu | ||||
|     secrets: inherit | ||||
|  | ||||
|   run_pipelines_torch_gpu: | ||||
|     if: ${{ inputs.job == 'run_pipelines_torch_gpu' }} | ||||
|     name: PyTorch pipelines | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|         machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
| @ -181,7 +153,7 @@ jobs: | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|  | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
| @ -209,13 +181,82 @@ jobs: | ||||
|           name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports | ||||
|           path: /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports | ||||
|  | ||||
|   run_pipelines_tf_gpu: | ||||
|     if: ${{ inputs.job == 'run_pipelines_tf_gpu' }} | ||||
|     name: TensorFlow pipelines | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
|       image: huggingface/transformers-tensorflow-gpu | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     steps: | ||||
|       - name: Update clone | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           git fetch && git checkout ${{ github.sha }} | ||||
|  | ||||
|       - name: Reinstall transformers in edit mode (remove the one installed during docker image build) | ||||
|         working-directory: /transformers | ||||
|         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | ||||
|  | ||||
|       - name: NVIDIA-SMI | ||||
|         run: | | ||||
|           nvidia-smi | ||||
|  | ||||
|       - name: Environment | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           python3 utils/print_env.py | ||||
|  | ||||
|       - name: Show installed libraries and their versions | ||||
|         working-directory: /transformers | ||||
|         run: pip freeze | ||||
|  | ||||
|       - name: Set `machine_type` for report and artifact names | ||||
|         working-directory: /transformers | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|  | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
|           else | ||||
|             machine_type=${{ matrix.machine_type }} | ||||
|           fi | ||||
|  | ||||
|           echo "$machine_type" | ||||
|           echo "machine_type=$machine_type" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: Run all pipeline tests on GPU | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines | ||||
|  | ||||
|       - name: Failure short reports | ||||
|         if: ${{ always() }} | ||||
|         run: | | ||||
|           cat /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt | ||||
|  | ||||
|       - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports" | ||||
|         if: ${{ always() }} | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports | ||||
|           path: /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports | ||||
|  | ||||
|   run_examples_gpu: | ||||
|     if: ${{ inputs.job == 'run_examples_gpu' }} | ||||
|     name: Examples directory | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [aws-g4dn-4xlarge-cache] | ||||
|         machine_type: [aws-g4dn-2xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
| @ -249,7 +290,7 @@ jobs: | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|  | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
| @ -284,7 +325,7 @@ jobs: | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|         machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
| @ -341,12 +382,12 @@ jobs: | ||||
|         run: pip freeze | ||||
|  | ||||
|       - name: Set `machine_type` for report and artifact names | ||||
|         working-directory: ${{ inputs.working-directory-prefix }}/transformers | ||||
|         working-directory: /transformers | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|  | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
| @ -383,7 +424,7 @@ jobs: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }} | ||||
|         machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|         machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
| @ -426,7 +467,7 @@ jobs: | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|  | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
| @ -500,8 +541,8 @@ jobs: | ||||
|     needs: [ | ||||
|       setup, | ||||
|       run_models_gpu, | ||||
|       run_trainer_and_fsdp_gpu, | ||||
|       run_pipelines_torch_gpu, | ||||
|       run_pipelines_tf_gpu, | ||||
|       run_examples_gpu, | ||||
|       run_torch_cuda_extensions_gpu, | ||||
|       run_quantization_torch_gpu, | ||||
| @ -518,21 +559,15 @@ jobs: | ||||
|       folder_slices: ${{ needs.setup.outputs.folder_slices }} | ||||
|       quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }} | ||||
|       ci_event: ${{ inputs.ci_event }} | ||||
|       report_repo_id: ${{ inputs.report_repo_id }} | ||||
|  | ||||
|     secrets: inherit | ||||
|  | ||||
|   check_new_failures: | ||||
|     if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }} | ||||
|     name: Check new failures | ||||
|   check_new_model_failures: | ||||
|     if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job == 'run_models_gpu' && needs.send_results.result == 'success' }} | ||||
|     name: Check new model failures | ||||
|     needs: send_results | ||||
|     uses: ./.github/workflows/check_failed_tests.yml | ||||
|     uses: ./.github/workflows/check_failed_model_tests.yml | ||||
|     with: | ||||
|       docker: ${{ inputs.docker }} | ||||
|       start_sha: ${{ github.sha }} | ||||
|       job: ${{ inputs.job }} | ||||
|       slack_report_channel: ${{ inputs.slack_report_channel }} | ||||
|       ci_event: ${{ inputs.ci_event }} | ||||
|       report_repo_id: ${{ inputs.report_repo_id }} | ||||
|  | ||||
|     secrets: inherit | ||||
|  | ||||
							
								
								
									
										56
									
								
								.github/workflows/slack-report.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										56
									
								
								.github/workflows/slack-report.yml
									
									
									
									
										vendored
									
									
								
							| @ -21,9 +21,6 @@ on: | ||||
|       ci_event: | ||||
|         required: true | ||||
|         type: string | ||||
|       report_repo_id: | ||||
|         required: true | ||||
|         type: string | ||||
|  | ||||
| env: | ||||
|   TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} | ||||
| @ -42,23 +39,8 @@ jobs: | ||||
|  | ||||
|       - uses: actions/checkout@v4 | ||||
|       - uses: actions/download-artifact@v4 | ||||
|  | ||||
|       - name: Prepare some setup values | ||||
|         run: | | ||||
|           if [ -f setup_values/prev_workflow_run_id.txt ]; then | ||||
|             echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> $GITHUB_ENV | ||||
|           else | ||||
|             echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV | ||||
|           fi | ||||
|  | ||||
|           if [ -f setup_values/other_workflow_run_id.txt ]; then | ||||
|             echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV | ||||
|           else | ||||
|             echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV | ||||
|           fi | ||||
|  | ||||
|       - name: Send message to Slack | ||||
|         shell: bash | ||||
|         if: ${{ inputs.job != 'run_quantization_torch_gpu' }} | ||||
|         env: | ||||
|           CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} | ||||
|           CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} | ||||
| @ -68,22 +50,19 @@ jobs: | ||||
|           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} | ||||
|           CI_EVENT: ${{ inputs.ci_event }} | ||||
|           CI_SHA: ${{ github.sha }} | ||||
|           CI_WORKFLOW_REF: ${{ github.workflow_ref }} | ||||
|           CI_TEST_JOB: ${{ inputs.job }} | ||||
|           SETUP_STATUS: ${{ inputs.setup_status }} | ||||
|           REPORT_REPO_ID: ${{ inputs.report_repo_id }} | ||||
|         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change | ||||
|         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. | ||||
|         # For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an | ||||
|         # empty string, and the called script still gets one argument (which is the empty string). | ||||
|         run: | | ||||
|           sudo apt-get install -y curl | ||||
|           pip install huggingface_hub | ||||
|           pip install slack_sdk | ||||
|           pip show slack_sdk | ||||
|           if [ "${{ inputs.quantization_matrix }}" != "" ]; then | ||||
|             python utils/notification_service.py "${{ inputs.quantization_matrix }}" | ||||
|           else | ||||
|           python utils/notification_service.py "${{ inputs.folder_slices }}" | ||||
|           fi           | ||||
|  | ||||
|       # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. | ||||
|       - name: Failure table artifacts | ||||
| @ -91,3 +70,32 @@ jobs: | ||||
|         with: | ||||
|           name: ci_results_${{ inputs.job }} | ||||
|           path: ci_results_${{ inputs.job }} | ||||
|  | ||||
|       - uses: actions/checkout@v4 | ||||
|       - uses: actions/download-artifact@v4 | ||||
|       - name: Send message to Slack for quantization workflow | ||||
|         if: ${{ inputs.job == 'run_quantization_torch_gpu' }} | ||||
|         env: | ||||
|           CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} | ||||
|           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} | ||||
|           SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }} | ||||
|           CI_EVENT: ${{ inputs.ci_event }} | ||||
|           CI_SHA: ${{ github.sha }} | ||||
|           CI_TEST_JOB: ${{ inputs.job }} | ||||
|           SETUP_STATUS: ${{ inputs.setup_status }} | ||||
|         # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change | ||||
|         # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`. | ||||
|         run: | | ||||
|           sudo apt-get install -y curl | ||||
|           pip install huggingface_hub | ||||
|           pip install slack_sdk | ||||
|           pip show slack_sdk | ||||
|           python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" | ||||
|  | ||||
|       # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. | ||||
|       - name: Failure table artifacts | ||||
|         if: ${{ inputs.job == 'run_quantization_torch_gpu' }} | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: ci_results_${{ inputs.job }} | ||||
|           path: ci_results_${{ inputs.job }} | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/workflows/ssh-runner.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/ssh-runner.yml
									
									
									
									
										vendored
									
									
								
							| @ -35,7 +35,7 @@ jobs: | ||||
|         shell: bash | ||||
|         run: | | ||||
|           if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then | ||||
|             echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV | ||||
|             echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV | ||||
|           elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then | ||||
|             echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV | ||||
|           elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then | ||||
|  | ||||
							
								
								
									
										2
									
								
								.github/workflows/update_metdata.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/update_metdata.yml
									
									
									
									
										vendored
									
									
								
							| @ -19,7 +19,7 @@ jobs: | ||||
|       - name: Setup environment | ||||
|         run: | | ||||
|           pip install --upgrade pip | ||||
|           pip install datasets pandas | ||||
|           pip install datasets pandas==2.0.3 | ||||
|           pip install .[torch,tf,flax] | ||||
|  | ||||
|       - name: Update metadata | ||||
|  | ||||
| @ -78,7 +78,7 @@ Once you've confirmed the bug hasn't already been reported, please include the f | ||||
| To get the OS and software versions automatically, run the following command: | ||||
|  | ||||
| ```bash | ||||
| transformers env | ||||
| transformers-cli env | ||||
| ``` | ||||
|  | ||||
| You can also run the same command from the root of the repository: | ||||
| @ -221,10 +221,10 @@ You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main | ||||
|    [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide. | ||||
|  | ||||
|    If you're modifying documents under the `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check | ||||
|    make sure you install the [documentation builder](https://github.com/huggingface/doc-builder). | ||||
|    make sure you install the documentation builder: | ||||
|  | ||||
|    ```bash | ||||
|    pip install hf-doc-builder | ||||
|    pip install ".[docs]" | ||||
|    ``` | ||||
|  | ||||
|    Run the following command from the root of the repository: | ||||
|  | ||||
| @ -26,7 +26,7 @@ There are two main venues to receive support: [the forums](https://discuss.huggi | ||||
|  | ||||
| [The user forums](https://discuss.huggingface.co/) are supported by the wide community of the library users and backed up by developers when needed. | ||||
|  | ||||
| If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystallized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues). | ||||
| If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystalized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues). | ||||
|  | ||||
| In particular all "Please explain" questions or objectively very user-specific feature requests belong to the forums. Here are some examples of such questions: | ||||
|  | ||||
| @ -263,9 +263,9 @@ You are not required to read the following guidelines before opening an issue. H | ||||
|     But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying to. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like: | ||||
|  | ||||
|     ``` | ||||
|     > How big is your GPU cluster? | ||||
|     > How big is your gpu cluster? | ||||
|  | ||||
|     Our cluster is made of 256 GPUs. | ||||
|     Our cluster is made of 256 gpus. | ||||
|     ``` | ||||
|  | ||||
|     If you are addressing multiple comments, quote the relevant parts of each before your answer. Some people use the same comment to do multiple replies, others separate them into separate comments. Either way works. The latter approach helps for linking to a specific comment. | ||||
|  | ||||
							
								
								
									
										3
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								Makefile
									
									
									
									
									
								
							| @ -37,6 +37,7 @@ autogenerate_code: deps_table_update | ||||
| repo-consistency: | ||||
| 	python utils/check_copies.py | ||||
| 	python utils/check_modular_conversion.py | ||||
| 	python utils/check_table.py | ||||
| 	python utils/check_dummies.py | ||||
| 	python utils/check_repo.py | ||||
| 	python utils/check_inits.py | ||||
| @ -45,6 +46,7 @@ repo-consistency: | ||||
| 	python utils/check_doctest_list.py | ||||
| 	python utils/update_metadata.py --check-only | ||||
| 	python utils/check_docstrings.py | ||||
| 	python utils/check_support_list.py | ||||
|  | ||||
| # this target runs checks on all files | ||||
|  | ||||
| @ -80,6 +82,7 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency | ||||
| fix-copies: | ||||
| 	python utils/check_copies.py --fix_and_overwrite | ||||
| 	python utils/check_modular_conversion.py  --fix_and_overwrite | ||||
| 	python utils/check_table.py --fix_and_overwrite | ||||
| 	python utils/check_dummies.py --fix_and_overwrite | ||||
| 	python utils/check_doctest_list.py --fix_and_overwrite | ||||
| 	python utils/check_docstrings.py --fix_and_overwrite | ||||
|  | ||||
							
								
								
									
										392
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										392
									
								
								README.md
									
									
									
									
									
								
							| @ -25,7 +25,6 @@ limitations under the License. | ||||
| </p> | ||||
|  | ||||
| <p align="center"> | ||||
|     <a href="https://huggingface.com/models"><img alt="Checkpoints on Hub" src="https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen"></a> | ||||
|     <a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a> | ||||
|     <a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a> | ||||
|     <a href="https://huggingface.co/docs/transformers/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online"></a> | ||||
| @ -55,258 +54,275 @@ limitations under the License. | ||||
| </h4> | ||||
|  | ||||
| <h3 align="center"> | ||||
|     <p>State-of-the-art pretrained models for inference and training</p> | ||||
|     <p>State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow</p> | ||||
| </h3> | ||||
|  | ||||
| <h3 align="center"> | ||||
|     <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a> | ||||
| </h3> | ||||
|  | ||||
| Transformers is a library of pretrained text, computer vision, audio, video, and multimodal models for inference and training. Use Transformers to fine-tune models on your data, build inference applications, and for generative AI use cases across multiple modalities. | ||||
| 🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio. | ||||
|  | ||||
| There are over 500K+ Transformers [model checkpoints](https://huggingface.co/models?library=transformers&sort=trending) on the [Hugging Face Hub](https://huggingface.com/models) you can use. | ||||
| These models can be applied on: | ||||
|  | ||||
| Explore the [Hub](https://huggingface.com/) today to find a model and use Transformers to help you get started right away. | ||||
| * 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, and text generation, in over 100 languages. | ||||
| * 🖼️ Images, for tasks like image classification, object detection, and segmentation. | ||||
| * 🗣️ Audio, for tasks like speech recognition and audio classification. | ||||
|  | ||||
| ## Installation | ||||
| Transformer models can also perform tasks on **several modalities combined**, such as table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering. | ||||
|  | ||||
| Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.1+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+. | ||||
| 🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments. | ||||
|  | ||||
| Create and activate a virtual environment with [venv](https://docs.python.org/3/library/venv.html) or [uv](https://docs.astral.sh/uv/), a fast Rust-based Python package and project manager. | ||||
| 🤗 Transformers is backed by the three most popular deep learning libraries — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other. | ||||
|  | ||||
| ```py | ||||
| # venv | ||||
| python -m venv .my-env | ||||
| source .my-env/bin/activate | ||||
| # uv | ||||
| uv venv .my-env | ||||
| source .my-env/bin/activate | ||||
| ## Online demos | ||||
|  | ||||
| You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) for public and private models. | ||||
|  | ||||
| Here are a few examples: | ||||
|  | ||||
| In Natural Language Processing: | ||||
| - [Masked word completion with BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) | ||||
| - [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) | ||||
| - [Text generation with Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | ||||
| - [Natural Language Inference with RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) | ||||
| - [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) | ||||
| - [Question answering with DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) | ||||
| - [Translation with T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) | ||||
|  | ||||
| In Computer Vision: | ||||
| - [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224) | ||||
| - [Object Detection with DETR](https://huggingface.co/facebook/detr-resnet-50) | ||||
| - [Semantic Segmentation with SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) | ||||
| - [Panoptic Segmentation with Mask2Former](https://huggingface.co/facebook/mask2former-swin-large-coco-panoptic) | ||||
| - [Depth Estimation with Depth Anything](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) | ||||
| - [Video Classification with VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae) | ||||
| - [Universal Segmentation with OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large) | ||||
|  | ||||
| In Audio: | ||||
| - [Automatic Speech Recognition with Whisper](https://huggingface.co/openai/whisper-large-v3) | ||||
| - [Keyword Spotting with Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) | ||||
| - [Audio Classification with Audio Spectrogram Transformer](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) | ||||
|  | ||||
| In Multimodal tasks: | ||||
| - [Table Question Answering with TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq) | ||||
| - [Visual Question Answering with ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa) | ||||
| - [Image captioning with LLaVa](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | ||||
| - [Zero-shot Image Classification with SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384) | ||||
| - [Document Question Answering with LayoutLM](https://huggingface.co/impira/layoutlm-document-qa) | ||||
| - [Zero-shot Video Classification with X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip) | ||||
| - [Zero-shot Object Detection with OWLv2](https://huggingface.co/docs/transformers/en/model_doc/owlv2) | ||||
| - [Zero-shot Image Segmentation with CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg) | ||||
| - [Automatic Mask Generation with SAM](https://huggingface.co/docs/transformers/model_doc/sam) | ||||
|  | ||||
|  | ||||
| ## 100 projects using Transformers | ||||
|  | ||||
| Transformers is more than a toolkit to use pretrained models: it's a community of projects built around it and the | ||||
| Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone | ||||
| else to build their dream projects. | ||||
|  | ||||
| In order to celebrate the 100,000 stars of transformers, we have decided to put the spotlight on the | ||||
| community, and we have created the [awesome-transformers](./awesome-transformers.md) page which lists 100 | ||||
| incredible projects built in the vicinity of transformers. | ||||
|  | ||||
| If you own or use a project that you believe should be part of the list, please open a PR to add it! | ||||
|  | ||||
| ## Serious about AI in your organisation? Build faster with the Hugging Face Enterprise Hub. | ||||
|  | ||||
| <a target="_blank" href="https://huggingface.co/enterprise"> | ||||
|     <img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925"> | ||||
| </a><br> | ||||
|  | ||||
| ## Quick tour | ||||
|  | ||||
| To immediately use a model on a given input (text, image, audio, ...), we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts: | ||||
|  | ||||
| ```python | ||||
| >>> from transformers import pipeline | ||||
|  | ||||
| # Allocate a pipeline for sentiment-analysis | ||||
| >>> classifier = pipeline('sentiment-analysis') | ||||
| >>> classifier('We are very happy to introduce pipeline to the transformers repository.') | ||||
| [{'label': 'POSITIVE', 'score': 0.9996980428695679}] | ||||
| ``` | ||||
|  | ||||
| Install Transformers in your virtual environment. | ||||
| The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here, the answer is "positive" with a confidence of 99.97%. | ||||
|  | ||||
| ```py | ||||
| # pip | ||||
| pip install "transformers[torch]" | ||||
| Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in computer vision and speech. For example, we can easily extract detected objects in an image: | ||||
|  | ||||
| # uv | ||||
| uv pip install "transformers[torch]" | ||||
| ``` python | ||||
| >>> import requests | ||||
| >>> from PIL import Image | ||||
| >>> from transformers import pipeline | ||||
|  | ||||
| # Download an image with cute cats | ||||
| >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" | ||||
| >>> image_data = requests.get(url, stream=True).raw | ||||
| >>> image = Image.open(image_data) | ||||
|  | ||||
| # Allocate a pipeline for object detection | ||||
| >>> object_detector = pipeline('object-detection') | ||||
| >>> object_detector(image) | ||||
| [{'score': 0.9982201457023621, | ||||
|   'label': 'remote', | ||||
|   'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}}, | ||||
|  {'score': 0.9960021376609802, | ||||
|   'label': 'remote', | ||||
|   'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}}, | ||||
|  {'score': 0.9954745173454285, | ||||
|   'label': 'couch', | ||||
|   'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}}, | ||||
|  {'score': 0.9988006353378296, | ||||
|   'label': 'cat', | ||||
|   'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}}, | ||||
|  {'score': 0.9986783862113953, | ||||
|   'label': 'cat', | ||||
|   'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}] | ||||
| ``` | ||||
|  | ||||
| Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error. | ||||
|  | ||||
| ```shell | ||||
| git clone https://github.com/huggingface/transformers.git | ||||
| cd transformers | ||||
|  | ||||
| # pip | ||||
| pip install .[torch] | ||||
|  | ||||
| # uv | ||||
| uv pip install .[torch] | ||||
| ``` | ||||
|  | ||||
| ## Quickstart | ||||
|  | ||||
| Get started with Transformers right away with the [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial) API. The `Pipeline` is a high-level inference class that supports text, audio, vision, and multimodal tasks. It handles preprocessing the input and returns the appropriate output. | ||||
|  | ||||
| Instantiate a pipeline and specify model to use for text generation. The model is downloaded and cached so you can easily reuse it again. Finally, pass some text to prompt the model. | ||||
|  | ||||
| ```py | ||||
| from transformers import pipeline | ||||
|  | ||||
| pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B") | ||||
| pipeline("the secret to baking a really good cake is ") | ||||
| [{'generated_text': 'the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? To make 2 cakes, you will need 2 cups of sugar.'}] | ||||
| ``` | ||||
|  | ||||
| To chat with a model, the usage pattern is the same. The only difference is you need to construct a chat history (the input to `Pipeline`) between you and the system. | ||||
|  | ||||
| > [!TIP] | ||||
| > You can also chat with a model directly from the command line. | ||||
| > ```shell | ||||
| > transformers chat Qwen/Qwen2.5-0.5B-Instruct | ||||
| > ``` | ||||
|  | ||||
| ```py | ||||
| import torch | ||||
| from transformers import pipeline | ||||
|  | ||||
| chat = [ | ||||
|     {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, | ||||
|     {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} | ||||
| ] | ||||
|  | ||||
| pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto") | ||||
| response = pipeline(chat, max_new_tokens=512) | ||||
| print(response[0]["generated_text"][-1]["content"]) | ||||
| ``` | ||||
|  | ||||
| Expand the examples below to see how `Pipeline` works for different modalities and tasks. | ||||
|  | ||||
| <details> | ||||
| <summary>Automatic speech recognition</summary> | ||||
|  | ||||
| ```py | ||||
| from transformers import pipeline | ||||
|  | ||||
| pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3") | ||||
| pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") | ||||
| {'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} | ||||
| ``` | ||||
|  | ||||
| </details> | ||||
|  | ||||
| <details> | ||||
| <summary>Image classification</summary> | ||||
| Here, we get a list of objects detected in the image, with a box surrounding the object and a confidence score. Here is the original image on the left, with the predictions displayed on the right: | ||||
|  | ||||
| <h3 align="center"> | ||||
|     <a><img src="https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"></a> | ||||
|     <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a> | ||||
|     <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a> | ||||
| </h3> | ||||
|  | ||||
| ```py | ||||
| from transformers import pipeline | ||||
| You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary). | ||||
|  | ||||
| pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer") | ||||
| pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png") | ||||
| [{'label': 'macaw', 'score': 0.997848391532898}, | ||||
|  {'label': 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita', | ||||
|   'score': 0.0016551691805943847}, | ||||
|  {'label': 'lorikeet', 'score': 0.00018523589824326336}, | ||||
|  {'label': 'African grey, African gray, Psittacus erithacus', | ||||
|   'score': 7.85409429227002e-05}, | ||||
|  {'label': 'quail', 'score': 5.502637941390276e-05}] | ||||
| In addition to `pipeline`, to download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version: | ||||
| ```python | ||||
| >>> from transformers import AutoTokenizer, AutoModel | ||||
|  | ||||
| >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") | ||||
| >>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") | ||||
|  | ||||
| >>> inputs = tokenizer("Hello world!", return_tensors="pt") | ||||
| >>> outputs = model(**inputs) | ||||
| ``` | ||||
|  | ||||
| </details> | ||||
| And here is the equivalent code for TensorFlow: | ||||
| ```python | ||||
| >>> from transformers import AutoTokenizer, TFAutoModel | ||||
|  | ||||
| <details> | ||||
| <summary>Visual question answering</summary> | ||||
| >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") | ||||
| >>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") | ||||
|  | ||||
|  | ||||
| <h3 align="center"> | ||||
|     <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg"></a> | ||||
| </h3> | ||||
|  | ||||
| ```py | ||||
| from transformers import pipeline | ||||
|  | ||||
| pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base") | ||||
| pipeline( | ||||
|     image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg", | ||||
|     question="What is in the image?", | ||||
| ) | ||||
| [{'answer': 'statue of liberty'}] | ||||
| >>> inputs = tokenizer("Hello world!", return_tensors="tf") | ||||
| >>> outputs = model(**inputs) | ||||
| ``` | ||||
|  | ||||
| </details> | ||||
| The tokenizer is responsible for all the preprocessing the pretrained model expects and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or simply directly pass to your model using the ** argument unpacking operator. | ||||
|  | ||||
| ## Why should I use Transformers? | ||||
| The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset. | ||||
|  | ||||
| ## Why should I use transformers? | ||||
|  | ||||
| 1. Easy-to-use state-of-the-art models: | ||||
|     - High performance on natural language understanding & generation, computer vision, audio, video, and multimodal tasks. | ||||
|     - Low barrier to entry for researchers, engineers, and developers. | ||||
|     - High performance on natural language understanding & generation, computer vision, and audio tasks. | ||||
|     - Low barrier to entry for educators and practitioners. | ||||
|     - Few user-facing abstractions with just three classes to learn. | ||||
|     - A unified API for using all our pretrained models. | ||||
|  | ||||
| 1. Lower compute costs, smaller carbon footprint: | ||||
|     - Share trained models instead of training from scratch. | ||||
|     - Reduce compute time and production costs. | ||||
|     - Dozens of model architectures with 1M+ pretrained checkpoints across all modalities. | ||||
|     - Researchers can share trained models instead of always retraining. | ||||
|     - Practitioners can reduce compute time and production costs. | ||||
|     - Dozens of architectures with over 400,000 pretrained models across all modalities. | ||||
|  | ||||
| 1. Choose the right framework for every part of a models lifetime: | ||||
| 1. Choose the right framework for every part of a model's lifetime: | ||||
|     - Train state-of-the-art models in 3 lines of code. | ||||
|     - Move a single model between PyTorch/JAX/TF2.0 frameworks at will. | ||||
|     - Pick the right framework for training, evaluation, and production. | ||||
|     - Move a single model between TF2.0/PyTorch/JAX frameworks at will. | ||||
|     - Seamlessly pick the right framework for training, evaluation, and production. | ||||
|  | ||||
| 1. Easily customize a model or an example to your needs: | ||||
|     - We provide examples for each architecture to reproduce the results published by its original authors. | ||||
|     - Model internals are exposed as consistently as possible. | ||||
|     - Model files can be used independently of the library for quick experiments. | ||||
|  | ||||
| <a target="_blank" href="https://huggingface.co/enterprise"> | ||||
|     <img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925"> | ||||
| </a><br> | ||||
|  | ||||
| ## Why shouldn't I use Transformers? | ||||
| ## Why shouldn't I use transformers? | ||||
|  | ||||
| - This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files. | ||||
| - The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like [Accelerate](https://huggingface.co/docs/accelerate). | ||||
| - The [example scripts](https://github.com/huggingface/transformers/tree/main/examples) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work. | ||||
| - The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly, [Accelerate](https://huggingface.co/docs/accelerate)). | ||||
| - While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. | ||||
|  | ||||
| ## 100 projects using Transformers | ||||
| ## Installation | ||||
|  | ||||
| Transformers is more than a toolkit to use pretrained models, it's a community of projects built around it and the | ||||
| Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone | ||||
| else to build their dream projects. | ||||
| ### With pip | ||||
|  | ||||
| In order to celebrate Transformers 100,000 stars, we wanted to put the spotlight on the | ||||
| community with the [awesome-transformers](./awesome-transformers.md) page which lists 100 | ||||
| incredible projects built with Transformers. | ||||
| This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 2.0+, and TensorFlow 2.6+. | ||||
|  | ||||
| If you own or use a project that you believe should be part of the list, please open a PR to add it! | ||||
| You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). | ||||
|  | ||||
| ## Example models | ||||
| First, create a virtual environment with the version of Python you're going to use and activate it. | ||||
|  | ||||
| You can test most of our models directly on their [Hub model pages](https://huggingface.co/models). | ||||
| **macOS/Linux** | ||||
|  | ||||
| Expand each modality below to see a few example models for various use cases. | ||||
| ```shell | ||||
| python -m venv env | ||||
| source env/bin/activate | ||||
| ``` | ||||
|  | ||||
| <details> | ||||
| <summary>Audio</summary> | ||||
| **Windows** | ||||
|  | ||||
| - Audio classification with [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo) | ||||
| - Automatic speech recognition with [Moonshine](https://huggingface.co/UsefulSensors/moonshine) | ||||
| - Keyword spotting with [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) | ||||
| - Speech to speech generation with [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16) | ||||
| - Text to audio with [MusicGen](https://huggingface.co/facebook/musicgen-large) | ||||
| - Text to speech with [Bark](https://huggingface.co/suno/bark) | ||||
| ``` | ||||
| python -m venv env | ||||
| env\Scripts\activate | ||||
| ``` | ||||
|  | ||||
| </details> | ||||
| To use 🤗 Transformers, you must install at least one of Flax, PyTorch, or TensorFlow. Refer to the official installation guides for platform-specific commands: | ||||
|  | ||||
| <details> | ||||
| <summary>Computer vision</summary> | ||||
| [TensorFlow installation page](https://www.tensorflow.org/install/),  | ||||
| [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation)  | ||||
|  | ||||
| - Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base) | ||||
| - Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf) | ||||
| - Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base) | ||||
| - Keypoint detection with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor) | ||||
| - Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue) | ||||
| - Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd) | ||||
| - Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple) | ||||
| - Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large) | ||||
| - Video classification with [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large) | ||||
| When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows: | ||||
|  | ||||
| </details> | ||||
| ``` | ||||
| pip install transformers | ||||
| ``` | ||||
|  | ||||
| <details> | ||||
| <summary>Multimodal</summary> | ||||
| If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source). | ||||
|  | ||||
| - Audio or text to text with [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B) | ||||
| - Document question answering with [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base) | ||||
| - Image or text to text with [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) | ||||
| - Image captioning [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) | ||||
| - OCR-based document understanding with [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf) | ||||
| - Table question answering with [TAPAS](https://huggingface.co/google/tapas-base) | ||||
| - Unified multimodal understanding and generation with [Emu3](https://huggingface.co/BAAI/Emu3-Gen) | ||||
| - Vision to text with [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) | ||||
| - Visual question answering with [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | ||||
| - Visual referring expression segmentation with [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224) | ||||
| ``` | ||||
| git clone https://github.com/huggingface/transformers.git | ||||
| cd transformers | ||||
| pip install . | ||||
| ``` | ||||
|  | ||||
| </details> | ||||
| ### With conda | ||||
|  | ||||
| <details> | ||||
| <summary>NLP</summary> | ||||
| 🤗 Transformers can be installed using conda as follows: | ||||
|  | ||||
| - Masked word completion with [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base) | ||||
| - Named entity recognition with [Gemma](https://huggingface.co/google/gemma-2-2b) | ||||
| - Question answering with [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) | ||||
| - Summarization with [BART](https://huggingface.co/facebook/bart-large-cnn) | ||||
| - Translation with [T5](https://huggingface.co/google-t5/t5-base) | ||||
| - Text generation with [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B) | ||||
| - Text classification with [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B) | ||||
| ```shell script | ||||
| conda install conda-forge::transformers | ||||
| ``` | ||||
|  | ||||
| </details> | ||||
| > **_NOTE:_** Installing `transformers` from the `huggingface` channel is deprecated. | ||||
|  | ||||
| Follow the installation pages of Flax, PyTorch or TensorFlow to see how to install them with conda. | ||||
|  | ||||
| > **_NOTE:_**  On Windows, you may be prompted to activate Developer Mode in order to benefit from caching. If this is not an option for you, please let us know in [this issue](https://github.com/huggingface/huggingface_hub/issues/1062). | ||||
|  | ||||
| ## Model architectures | ||||
|  | ||||
| **[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations). | ||||
|  | ||||
| Current number of checkpoints:  | ||||
|  | ||||
| 🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them. | ||||
|  | ||||
| To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks). | ||||
|  | ||||
| These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations. You can find more details on performance in the Examples section of the [documentation](https://github.com/huggingface/transformers/tree/main/examples). | ||||
|  | ||||
|  | ||||
| ## Learn more | ||||
|  | ||||
| | Section | Description | | ||||
| |-|-| | ||||
| | [Documentation](https://huggingface.co/docs/transformers/) | Full API documentation and tutorials | | ||||
| | [Task summary](https://huggingface.co/docs/transformers/task_summary) | Tasks supported by 🤗 Transformers | | ||||
| | [Preprocessing tutorial](https://huggingface.co/docs/transformers/preprocessing) | Using the `Tokenizer` class to prepare data for the models | | ||||
| | [Training and fine-tuning](https://huggingface.co/docs/transformers/training) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API | | ||||
| | [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/main/examples) | Example scripts for fine-tuning models on a wide range of tasks | | ||||
| | [Model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing) | Upload and share your fine-tuned models with the community | | ||||
|  | ||||
| ## Citation | ||||
|  | ||||
|  | ||||
| @ -27,6 +27,13 @@ These models require the `trust_remote_code=True` parameter to be set when using | ||||
| the content of the modeling files when using this argument. We recommend setting a revision in order to ensure you | ||||
| protect yourself from updates on the repository. | ||||
|  | ||||
| #### Tools | ||||
|  | ||||
| Through the `Agent` framework, remote tools can be downloaded to be used by the Agent. You're to specify these tools | ||||
| yourself, but please keep in mind that their code will be run on your machine if the Agent chooses to run them. | ||||
|  | ||||
| Please inspect the code of the tools before passing them to the Agent to protect your runtime and local setup. | ||||
|  | ||||
| ## Reporting a Vulnerability | ||||
|  | ||||
| Feel free to submit vulnerability reports to [security@huggingface.co](mailto:security@huggingface.co), where someone from the HF security team will review and recommend next steps. If reporting a vulnerability specific to open source, please note [Huntr](https://huntr.com) is a vulnerability disclosure program for open source software. | ||||
|  | ||||
| @ -29,7 +29,7 @@ Keywords: inpainting, SD, Stable Diffusion | ||||
|  | ||||
| ## [flair](https://github.com/flairNLP/flair) | ||||
|  | ||||
| FLAIR is a powerful PyTorch NLP framework, covering several important tasks: NER, sentiment-analysis, part-of-speech tagging, text and document embeddings, among other things. | ||||
| FLAIR is a powerful PyTorch NLP framework, covering several important tasks: NER, sentiment-analysis, part-of-speech tagging, text and document embeddings, among other things. | ||||
|  | ||||
| Keywords: NLP, text embedding, document embedding, biomedical, NER, PoS, sentiment-analysis | ||||
|  | ||||
| @ -47,7 +47,7 @@ Keywords: LLMs, Large Language Models, Agents, Chains | ||||
|  | ||||
| ## [LlamaIndex](https://github.com/run-llama/llama_index) | ||||
|  | ||||
| [LlamaIndex](https://github.com/run-llama/llama_index) is a project that provides a central interface to connect your LLM's with external data. It provides various kinds of indices and retrieval mechanisms to perform different LLM tasks and obtain knowledge-augmented results. | ||||
| [LlamaIndex](https://github.com/run-llama/llama_index) is a project that provides a central interface to connect your LLM's with external data. It provides various kinds of indices and retrieval mechanisms to perform different LLM tasks and obtain knowledge-augmented results. | ||||
|  | ||||
| Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation  | ||||
|  | ||||
|  | ||||
| @ -12,7 +12,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, | ||||
|  | ||||
| ## Writing metrics to the database | ||||
|  | ||||
| `MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements. | ||||
| `MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements. | ||||
|  | ||||
| cf [`llama.py`](./llama.py) to see an example of this in practice. | ||||
|  | ||||
|  | ||||
| @ -90,7 +90,7 @@ def summarize(run_dir, metrics, expand_metrics=False): | ||||
|  | ||||
|         model = benchmark.config.backend["model"] | ||||
|  | ||||
|         # This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`. | ||||
|         # This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`. | ||||
|         # (we rely on the usage of hydra's `${hydra.job.override_dirname}`.) | ||||
|         benchmark_name = re.sub(f"backend.model={model},*", "", report_dir) | ||||
|         benchmark_name = str(Path(benchmark_name).parts[-1]) | ||||
|  | ||||
| @ -2,11 +2,12 @@ import argparse | ||||
| import importlib.util | ||||
| import logging | ||||
| import os | ||||
| from typing import Dict | ||||
| import psycopg2 | ||||
| import sys | ||||
| from typing import Dict, Tuple | ||||
|  | ||||
| from psycopg2.extensions import register_adapter | ||||
| from psycopg2.extras import Json | ||||
| from psycopg2.extensions import register_adapter | ||||
|  | ||||
|  | ||||
| register_adapter(dict, Json) | ||||
| @ -17,13 +18,10 @@ class ImportModuleException(Exception): | ||||
|  | ||||
|  | ||||
| class MetricsRecorder: | ||||
|     def __init__( | ||||
|         self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str | ||||
|     ): | ||||
|     def __init__(self, connection, logger: logging.Logger, branch: str, commit_id: str, commit_msg: str): | ||||
|         self.conn = connection | ||||
|         self.conn.autocommit = True | ||||
|         self.logger = logger | ||||
|         self.repository = repository | ||||
|         self.branch = branch | ||||
|         self.commit_id = commit_id | ||||
|         self.commit_msg = commit_msg | ||||
| @ -35,8 +33,8 @@ class MetricsRecorder: | ||||
|         # gpu_name: str, model_id: str | ||||
|         with self.conn.cursor() as cur: | ||||
|             cur.execute( | ||||
|                 "INSERT INTO benchmarks (repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s) RETURNING benchmark_id", | ||||
|                 (self.repository, self.branch, self.commit_id, self.commit_msg, metadata), | ||||
|                 "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s) RETURNING benchmark_id", | ||||
|                 (self.branch, self.commit_id, self.commit_msg, metadata), | ||||
|             ) | ||||
|             benchmark_id = cur.fetchone()[0] | ||||
|             logger.debug(f"initialised benchmark #{benchmark_id}") | ||||
| @ -85,18 +83,12 @@ handler.setFormatter(formatter) | ||||
| logger.addHandler(handler) | ||||
|  | ||||
|  | ||||
| def parse_arguments() -> Tuple[str, str, str, str]: | ||||
| def parse_arguments(): | ||||
|     """ | ||||
|     Parse command line arguments for the benchmarking CLI. | ||||
|     """ | ||||
|     parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.") | ||||
|  | ||||
|     parser.add_argument( | ||||
|         "repository", | ||||
|         type=str, | ||||
|         help="The repository name on which the benchmarking is performed.", | ||||
|     ) | ||||
|  | ||||
|     parser.add_argument( | ||||
|         "branch", | ||||
|         type=str, | ||||
| @ -117,7 +109,7 @@ def parse_arguments() -> Tuple[str, str, str, str]: | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     return args.repository, args.branch, args.commit_id, args.commit_msg | ||||
|     return args.branch, args.commit_id, args.commit_msg | ||||
|  | ||||
|  | ||||
| def import_from_path(module_name, file_path): | ||||
| @ -134,7 +126,7 @@ def import_from_path(module_name, file_path): | ||||
| if __name__ == "__main__": | ||||
|     benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__)) | ||||
|  | ||||
|     repository, branch, commit_id, commit_msg = parse_arguments() | ||||
|     branch, commit_id, commit_msg = parse_arguments() | ||||
|  | ||||
|     for entry in os.scandir(benchmarks_folder_path): | ||||
|         try: | ||||
| @ -144,8 +136,8 @@ if __name__ == "__main__": | ||||
|                 continue | ||||
|             logger.debug(f"loading: {entry.name}") | ||||
|             module = import_from_path(entry.name.split(".")[0], entry.path) | ||||
|             logger.info(f"running benchmarks in: {entry.name}") | ||||
|             module.run_benchmark(logger, repository, branch, commit_id, commit_msg) | ||||
|             logger.info(f"runnning benchmarks in: {entry.name}") | ||||
|             module.run_benchmark(logger, branch, commit_id, commit_msg) | ||||
|         except ImportModuleException as e: | ||||
|             logger.error(e) | ||||
|         except Exception as e: | ||||
|  | ||||
| @ -1,6 +1,5 @@ | ||||
| CREATE TABLE IF NOT EXISTS benchmarks ( | ||||
|   benchmark_id SERIAL PRIMARY KEY, | ||||
|   repository VARCHAR(255), | ||||
|   branch VARCHAR(255), | ||||
|   commit_id VARCHAR(72), | ||||
|   commit_message VARCHAR(70), | ||||
|  | ||||
| @ -33,15 +33,11 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder): | ||||
|         sleep(0.01) | ||||
|  | ||||
|  | ||||
| def run_benchmark( | ||||
|     logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100 | ||||
| ): | ||||
| def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): | ||||
|     continue_metric_collection = Event() | ||||
|     metrics_thread = None | ||||
|     model_id = "meta-llama/Llama-2-7b-hf" | ||||
|     metrics_recorder = MetricsRecorder( | ||||
|         psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg | ||||
|     ) | ||||
|     metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg) | ||||
|     try: | ||||
|         gpu_stats = gpustat.GPUStatCollection.new_query() | ||||
|         gpu_name = gpu_stats[0]["name"] | ||||
| @ -122,7 +118,7 @@ def run_benchmark( | ||||
|         with torch.no_grad(): | ||||
|             past_key_values = StaticCache( | ||||
|                 model.config, | ||||
|                 max_batch_size=batch_size, | ||||
|                 batch_size=batch_size, | ||||
|                 device=device, | ||||
|                 dtype=torch.float16, | ||||
|                 max_cache_len=seq_length + num_tokens_to_generate, | ||||
| @ -148,7 +144,7 @@ def run_benchmark( | ||||
|  | ||||
|             past_key_values = StaticCache( | ||||
|                 model.config, | ||||
|                 max_batch_size=batch_size, | ||||
|                 batch_size=batch_size, | ||||
|                 device=device, | ||||
|                 dtype=torch.float16, | ||||
|                 max_cache_len=seq_length + num_tokens_to_generate, | ||||
| @ -191,7 +187,7 @@ def run_benchmark( | ||||
|             # TODO use  decode_one_token(model, input_id.clone(), cache_position) for verification | ||||
|             past_key_values = StaticCache( | ||||
|                 model.config, | ||||
|                 max_batch_size=batch_size, | ||||
|                 batch_size=batch_size, | ||||
|                 device=device, | ||||
|                 dtype=torch.float16, | ||||
|                 max_cache_len=seq_length + num_tokens_to_generate + 10, | ||||
| @ -208,7 +204,7 @@ def run_benchmark( | ||||
|             time_to_first_token = end - start | ||||
|             logger.info(f"completed first compile generation in: {time_to_first_token}s") | ||||
|             cache_position += 1 | ||||
|             all_generated_tokens += next_token.tolist() | ||||
|             all_generated_tokens += next_token.clone().detach().cpu().tolist() | ||||
|  | ||||
|             cache_position = torch.tensor([seq_length], device=device) | ||||
|             ### First compile, decoding | ||||
| @ -219,9 +215,9 @@ def run_benchmark( | ||||
|             torch.cuda.synchronize() | ||||
|             end = perf_counter() | ||||
|             time_to_second_token = end - start | ||||
|             logger.info(f"completed second compile generation in: {time_to_second_token}s") | ||||
|             logger.info(f"completed second compile generation in: {time_to_first_token}s") | ||||
|             cache_position += 1 | ||||
|             all_generated_tokens += next_token.tolist() | ||||
|             all_generated_tokens += next_token.clone().detach().cpu().tolist() | ||||
|  | ||||
|             ### Second compile, decoding | ||||
|             start = perf_counter() | ||||
| @ -231,15 +227,15 @@ def run_benchmark( | ||||
|             torch.cuda.synchronize() | ||||
|             end = perf_counter() | ||||
|             time_to_third_token = end - start | ||||
|             logger.info(f"completed third compile forward in: {time_to_third_token}s") | ||||
|             logger.info(f"completed third compile forward in: {time_to_first_token}s") | ||||
|             cache_position += 1 | ||||
|             all_generated_tokens += next_token.tolist() | ||||
|             all_generated_tokens += next_token.clone().detach().cpu().tolist() | ||||
|  | ||||
|             ### Using cuda graphs decoding | ||||
|  | ||||
|             start = perf_counter() | ||||
|             for _ in range(1, num_tokens_to_generate): | ||||
|                 all_generated_tokens += next_token.tolist() | ||||
|                 all_generated_tokens += next_token.clone().detach().cpu().tolist() | ||||
|                 next_token = decode_one_token( | ||||
|                     model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values | ||||
|                 ) | ||||
| @ -258,7 +254,7 @@ def run_benchmark( | ||||
|  | ||||
|             past_key_values = StaticCache( | ||||
|                 model.config, | ||||
|                 max_batch_size=batch_size, | ||||
|                 batch_size=batch_size, | ||||
|                 device=device, | ||||
|                 dtype=torch.float16, | ||||
|                 max_cache_len=seq_length + 128, | ||||
| @ -275,7 +271,7 @@ def run_benchmark( | ||||
|  | ||||
|             past_key_values = StaticCache( | ||||
|                 model.config, | ||||
|                 max_batch_size=batch_size, | ||||
|                 batch_size=batch_size, | ||||
|                 device=device, | ||||
|                 dtype=torch.float16, | ||||
|                 max_cache_len=seq_length + 128, | ||||
| @ -291,23 +287,23 @@ def run_benchmark( | ||||
|  | ||||
|             past_key_values = StaticCache( | ||||
|                 model.config, | ||||
|                 max_batch_size=batch_size, | ||||
|                 batch_size=batch_size, | ||||
|                 device=device, | ||||
|                 dtype=torch.float16, | ||||
|                 max_cache_len=seq_length + 128, | ||||
|             ) | ||||
|  | ||||
|             # 3rd call | ||||
|             # 3nd call | ||||
|             start = perf_counter() | ||||
|             output = model.generate(**inputs, past_key_values=past_key_values) | ||||
|             end = perf_counter() | ||||
|             third_compile_generate_time = end - start | ||||
|             logger.info(f"completed third compile generation in: {third_compile_generate_time}s") | ||||
|             logger.info(f"completed second compile generation in: {third_compile_generate_time}s") | ||||
|             logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") | ||||
|  | ||||
|             past_key_values = StaticCache( | ||||
|                 model.config, | ||||
|                 max_batch_size=batch_size, | ||||
|                 batch_size=batch_size, | ||||
|                 device=device, | ||||
|                 dtype=torch.float16, | ||||
|                 max_cache_len=seq_length + 128, | ||||
| @ -317,7 +313,7 @@ def run_benchmark( | ||||
|             output = model.generate(**inputs, past_key_values=past_key_values) | ||||
|             end = perf_counter() | ||||
|             fourth_compile_generate_time = end - start | ||||
|             logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s") | ||||
|             logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s") | ||||
|             logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") | ||||
|  | ||||
|         metrics_recorder.collect_model_measurements( | ||||
|  | ||||
| @ -46,6 +46,10 @@ NOT_DEVICE_TESTS = { | ||||
|     "test_keep_in_fp32_modules", | ||||
|     "test_gradient_checkpointing_backward_compatibility", | ||||
|     "test_gradient_checkpointing_enable_disable", | ||||
|     "test_save_load_fast_init_from_base", | ||||
|     "test_fast_init_context_manager", | ||||
|     "test_fast_init_tied_embeddings", | ||||
|     "test_save_load_fast_init_to_base", | ||||
|     "test_torch_save_load", | ||||
|     "test_initialization", | ||||
|     "test_forward_signature", | ||||
| @ -66,6 +70,7 @@ NOT_DEVICE_TESTS = { | ||||
|     "ModelTester::test_pipeline_", | ||||
|     "/repo_utils/", | ||||
|     "/utils/", | ||||
|     "/agents/", | ||||
| } | ||||
|  | ||||
| # allow having multiple repository checkouts and not needing to remember to rerun | ||||
| @ -82,6 +87,7 @@ def pytest_configure(config): | ||||
|     config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested") | ||||
|     config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") | ||||
|     config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") | ||||
|     config.addinivalue_line("markers", "agent_tests: mark the agent tests that are run on their specific schedule") | ||||
|     config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu") | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -2,7 +2,7 @@ | ||||
|  | ||||
| In this folder you will find various docker files, and some subfolders.  | ||||
| - dockerfiles (ex: `consistency.dockerfile`) present under `~/docker` are used for our "fast" CIs. You should be able to use them for tasks that only need CPU. For example, `torch-light` is a very lightweight container (703MiB).  | ||||
| - subfolders contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs) | ||||
| - subfolders contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs) | ||||
|  | ||||
| Note that in both cases, you need to run `uv pip install -e .`, which should take around 5 seconds. We do it outside the dockerfile for the needs of our CI: we check out a new branch each time, and the `transformers` code is thus updated.  | ||||
|  | ||||
|  | ||||
| @ -5,12 +5,12 @@ ARG REF=main | ||||
| RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython | ||||
| RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu | ||||
| # tensorflow pin matching setup.py | ||||
| RUN uv pip install --no-cache-dir pypi-kenlm | ||||
| RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16" | ||||
| RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,testing,torch-speech,vision]" | ||||
| RUN git lfs install | ||||
|  | ||||
| RUN uv pip uninstall transformers | ||||
| RUN pip uninstall -y transformers | ||||
| RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean | ||||
| @ -1,6 +1,5 @@ | ||||
| FROM python:3.9-slim | ||||
| ENV PYTHONDONTWRITEBYTECODE=1 | ||||
| ARG REF=main | ||||
| USER root | ||||
| RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| @ -18,10 +17,10 @@ RUN make install -j 10 | ||||
|  | ||||
| RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu  | ||||
| RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite | ||||
| RUN uv pip install  --no-cache-dir "transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite | ||||
| # spacy is not used, so it is not tested. It causes failures. TODO: fix later | ||||
| RUN python3 -m unidic download | ||||
| RUN uv pip uninstall transformers | ||||
| RUN pip uninstall -y transformers | ||||
|  | ||||
| RUN apt-get clean && rm -rf /var/lib/apt/lists/* | ||||
| RUN apt remove -y g++ cmake  xz-utils libprotobuf-dev protobuf-compiler | ||||
| @ -1,13 +1,12 @@ | ||||
| FROM python:3.9-slim | ||||
| ENV PYTHONDONTWRITEBYTECODE=1 | ||||
| ARG REF=main | ||||
| USER root | ||||
| RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git | ||||
| RUN apt-get install -y g++ cmake | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip --no-cache-dir install uv && uv venv | ||||
| RUN uv pip install --no-cache-dir -U pip setuptools albumentations seqeval | ||||
| RUN uv pip install  --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" | ||||
| RUN pip install  --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" | ||||
| RUN uv pip install --no-cache-dir  "protobuf==3.20.3"  | ||||
| RUN uv pip uninstall transformers | ||||
| RUN pip uninstall -y transformers | ||||
| RUN apt-get clean && rm -rf /var/lib/apt/lists/* | ||||
| @ -1,12 +1,11 @@ | ||||
| FROM python:3.9-slim | ||||
| ENV PYTHONDONTWRITEBYTECODE=1 | ||||
| ARG REF=main | ||||
| USER root | ||||
| RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools | ||||
| RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu  | ||||
| RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer | ||||
| RUN uv pip uninstall transformers | ||||
| RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer | ||||
| RUN pip uninstall -y transformers | ||||
| RUN apt-get clean && rm -rf /var/lib/apt/lists/* | ||||
| @ -5,13 +5,13 @@ USER root | ||||
| RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools | ||||
| RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN uv pip install --no-cache-dir  --no-deps timm accelerate | ||||
| RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk | ||||
| # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels | ||||
| RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose'  'dataset' | ||||
| RUN pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose'  'dataset' | ||||
| # RUN git clone https://github.com/facebookresearch/detectron2.git | ||||
| # RUN python3 -m pip install --no-cache-dir -e detectron2 | ||||
| RUN uv pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3' --no-build-isolation | ||||
| RUN uv pip uninstall transformers | ||||
| RUN pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3' | ||||
| RUN pip uninstall -y transformers | ||||
| RUN apt-get clean && rm -rf /var/lib/apt/lists/* | ||||
|  | ||||
| @ -5,6 +5,6 @@ USER root | ||||
| RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip --no-cache-dir install uv &&  uv venv && uv pip install --no-cache-dir -U pip setuptools | ||||
| RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]" | ||||
| RUN uv pip uninstall transformers | ||||
| RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]" | ||||
| RUN pip uninstall -y transformers | ||||
| RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean | ||||
| @ -5,6 +5,6 @@ USER root | ||||
| RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++ | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools | ||||
| RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]" | ||||
| RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]" | ||||
| RUN uv pip install --no-cache-dir  "protobuf==3.20.3" tensorflow_probability | ||||
| RUN apt-get clean && rm -rf /var/lib/apt/lists/* | ||||
| @ -5,7 +5,7 @@ USER root | ||||
| RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools | ||||
| RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu  | ||||
| RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" | ||||
| RUN uv pip uninstall transformers | ||||
| RUN pip uninstall -y transformers | ||||
| @ -6,7 +6,7 @@ RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-de | ||||
| RUN apt-get install -y  cmake | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools | ||||
| RUN uv pip install  --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" | ||||
| RUN pip install  --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" | ||||
| RUN uv pip install --no-cache-dir  "protobuf==3.20.3"  | ||||
| RUN uv pip uninstall transformers | ||||
| RUN pip uninstall -y transformers | ||||
| RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean | ||||
| @ -6,11 +6,11 @@ RUN apt-get update &&  apt-get install -y libsndfile1-dev espeak-ng time git g++ | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools | ||||
| RUN uv pip install --no-deps accelerate | ||||
| RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]" | ||||
| RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]" | ||||
|  | ||||
|  | ||||
| # RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]" | ||||
|  | ||||
| RUN uv pip uninstall transformers | ||||
| RUN pip uninstall -y transformers | ||||
| RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean | ||||
|  | ||||
| @ -5,7 +5,7 @@ USER root | ||||
| RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools | ||||
| RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu | ||||
| RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]" | ||||
| RUN uv pip uninstall transformers | ||||
| RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken]" | ||||
| RUN pip uninstall -y transformers | ||||
| @ -7,13 +7,13 @@ RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-de | ||||
| ENV UV_PYTHON=/usr/local/bin/python | ||||
| RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools | ||||
| RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu  | ||||
| RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu | ||||
| RUN git lfs install | ||||
|  | ||||
| RUN uv pip install --no-cache-dir pypi-kenlm | ||||
| RUN uv pip install --no-cache-dir  "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]" | ||||
| RUN pip install --no-cache-dir  "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]" | ||||
| RUN uv pip install --no-cache-dir  "protobuf==3.20.3" librosa | ||||
|  | ||||
|  | ||||
| RUN uv pip uninstall transformers | ||||
| RUN pip uninstall -y transformers | ||||
| RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean | ||||
| @ -14,8 +14,6 @@ ARG PYTORCH='2.6.0' | ||||
| ARG INTEL_TORCH_EXT='2.3.0' | ||||
| # Example: `cu102`, `cu113`, etc. | ||||
| ARG CUDA='cu121' | ||||
| # Disable kernel mapping for now until all tests pass | ||||
| ENV DISABLE_KERNEL_MAPPING=1 | ||||
|  | ||||
| RUN apt update | ||||
| RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs | ||||
| @ -28,7 +26,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers && | ||||
| # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future. | ||||
| # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`. | ||||
| #    Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions). | ||||
| RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability | ||||
| RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 "tensorflow_text<2.16" "tensorflow_probability<0.22" && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA | ||||
|  | ||||
| RUN python3 -m pip uninstall -y flax jax | ||||
|  | ||||
| @ -45,7 +43,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef | ||||
| RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum | ||||
|  | ||||
| # For video model testing | ||||
| RUN python3 -m pip install --no-cache-dir av | ||||
| RUN python3 -m pip install --no-cache-dir av==9.2.0 | ||||
|  | ||||
| # Some slow tests require bnb | ||||
| RUN python3 -m pip install --no-cache-dir bitsandbytes | ||||
| @ -59,8 +57,7 @@ RUN python3 -m pip uninstall -y ninja | ||||
|  | ||||
| # For `dinat` model | ||||
| # The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent) | ||||
| # pin `0.17.4` otherwise `cannot import name 'natten2dav' from 'natten.functional'` | ||||
| RUN python3 -m pip install --no-cache-dir natten==0.17.4+torch250cu121 -f https://shi-labs.com/natten/wheels | ||||
| RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels | ||||
|  | ||||
| # For `nougat` tokenizer | ||||
| RUN python3 -m pip install --no-cache-dir python-Levenshtein | ||||
| @ -71,12 +68,6 @@ RUN python3 -m pip install --no-cache-dir g2p-en | ||||
| # For Some bitsandbytes tests | ||||
| RUN python3 -m pip install --no-cache-dir einops | ||||
|  | ||||
| # For Some tests with `@require_liger_kernel` | ||||
| RUN python3 -m pip install --no-cache-dir liger-kernel | ||||
|  | ||||
| # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs | ||||
| RUN python3 -m pip uninstall -y kernels | ||||
|  | ||||
| # When installing in editable mode, `transformers` is not recognized as a package. | ||||
| # this line must be added in order for python to be aware of transformers. | ||||
| RUN cd transformers && python3 setup.py develop | ||||
|  | ||||
| @ -1,4 +1,4 @@ | ||||
| FROM rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0 | ||||
| FROM rocm/dev-ubuntu-22.04:6.2.4 | ||||
| LABEL maintainer="Hugging Face" | ||||
|  | ||||
| ARG DEBIAN_FRONTEND=noninteractive | ||||
| @ -11,6 +11,9 @@ RUN apt update && \ | ||||
| RUN git lfs install | ||||
|  | ||||
| RUN python3 -m pip install --no-cache-dir --upgrade pip numpy | ||||
|  | ||||
| RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 | ||||
|  | ||||
| RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0" | ||||
|  | ||||
| ARG REF=main | ||||
| @ -30,6 +33,3 @@ RUN cd transformers && python3 setup.py develop | ||||
|  | ||||
| # Remove nvml and nvidia-ml-py as it is not compatible with ROCm. apex is not tested on NVIDIA either. | ||||
| RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y | ||||
|  | ||||
| # `kernels` may cause many failing tests | ||||
| RUN python3 -m pip uninstall -y kernels | ||||
| @ -48,6 +48,3 @@ RUN python3 -c "from deepspeed.launcher.runner import main" | ||||
|  | ||||
| # Remove nvml as it is not compatible with ROCm | ||||
| RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y | ||||
|  | ||||
| # `kernels` may cause many failing tests | ||||
| RUN python3 -m pip uninstall -y kernels | ||||
|  | ||||
| @ -1,12 +1,12 @@ | ||||
| # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html | ||||
| FROM nvcr.io/nvidia/pytorch:24.08-py3 | ||||
| # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11 | ||||
| FROM nvcr.io/nvidia/pytorch:23.11-py3 | ||||
| LABEL maintainer="Hugging Face" | ||||
|  | ||||
| ARG DEBIAN_FRONTEND=noninteractive | ||||
|  | ||||
| ARG PYTORCH='2.6.0' | ||||
| ARG PYTORCH='2.2.0' | ||||
| # Example: `cu102`, `cu113`, etc. | ||||
| ARG CUDA='cu126' | ||||
| ARG CUDA='cu121' | ||||
|  | ||||
| RUN apt -y update | ||||
| RUN apt install -y libaio-dev | ||||
| @ -15,8 +15,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip | ||||
| ARG REF=main | ||||
| RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF | ||||
|  | ||||
| # `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors | ||||
| RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2' | ||||
| RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] | ||||
|  | ||||
| # Install latest release PyTorch | ||||
| # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) | ||||
| @ -45,9 +44,6 @@ RUN python3 -m pip uninstall -y deepspeed | ||||
| # TODO: Find out why test fail. | ||||
| RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 | ||||
|  | ||||
| # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs | ||||
| RUN python3 -m pip uninstall -y kernels | ||||
|  | ||||
| # When installing in editable mode, `transformers` is not recognized as a package. | ||||
| # this line must be added in order for python to be aware of transformers. | ||||
| RUN cd transformers && python3 setup.py develop | ||||
|  | ||||
| @ -1,11 +1,11 @@ | ||||
| # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11 | ||||
| FROM nvcr.io/nvidia/pytorch:24.08-py3 | ||||
| FROM nvcr.io/nvidia/pytorch:23.11-py3 | ||||
| LABEL maintainer="Hugging Face" | ||||
|  | ||||
| ARG DEBIAN_FRONTEND=noninteractive | ||||
|  | ||||
| # Example: `cu102`, `cu113`, etc. | ||||
| ARG CUDA='cu126' | ||||
| ARG CUDA='cu121' | ||||
|  | ||||
| RUN apt -y update | ||||
| RUN apt install -y libaio-dev | ||||
| @ -21,8 +21,7 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio | ||||
| # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) | ||||
| RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA | ||||
|  | ||||
| # `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors | ||||
| RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2' | ||||
| RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] | ||||
|  | ||||
| RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate | ||||
|  | ||||
| @ -57,9 +56,6 @@ RUN python3 -m pip uninstall -y deepspeed | ||||
| #RUN git clone https://github.com/pytorch/TensorRT.git | ||||
| #RUN cd TensorRT/py && python3 setup.py install --fx-only | ||||
|  | ||||
| # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs | ||||
| RUN python3 -m pip uninstall -y kernels | ||||
|  | ||||
| # When installing in editable mode, `transformers` is not recognized as a package. | ||||
| # this line must be added in order for python to be aware of transformers. | ||||
| RUN cd transformers && python3 setup.py develop | ||||
|  | ||||
| @ -28,9 +28,6 @@ RUN python3 -m pip uninstall -y tensorflow flax | ||||
| RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract | ||||
| RUN python3 -m pip install -U "itsdangerous<2.1.0" | ||||
|  | ||||
| # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs | ||||
| RUN python3 -m pip uninstall -y kernels | ||||
|  | ||||
| # When installing in editable mode, `transformers` is not recognized as a package. | ||||
| # this line must be added in order for python to be aware of transformers. | ||||
| RUN cd transformers && python3 setup.py develop | ||||
|  | ||||
| @ -1,4 +1,4 @@ | ||||
| FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 | ||||
| FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 | ||||
| LABEL maintainer="Hugging Face" | ||||
|  | ||||
| ARG DEBIAN_FRONTEND=noninteractive | ||||
| @ -9,11 +9,9 @@ SHELL ["sh", "-lc"] | ||||
| # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant | ||||
| # to be used as arguments for docker build (so far). | ||||
|  | ||||
| ARG PYTORCH='2.6.0' | ||||
| ARG PYTORCH='2.5.1' | ||||
| # Example: `cu102`, `cu113`, etc. | ||||
| ARG CUDA='cu121' | ||||
| # Disable kernel mapping for quantization tests | ||||
| ENV DISABLE_KERNEL_MAPPING=1 | ||||
| ARG CUDA='cu118' | ||||
|  | ||||
| RUN apt update | ||||
| RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg | ||||
| @ -28,6 +26,8 @@ RUN echo torch=$VERSION | ||||
| # Currently, let's just use their latest releases (when `torch` is installed with a release version) | ||||
| RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA | ||||
|  | ||||
| RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch] | ||||
|  | ||||
| RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate | ||||
|  | ||||
| # needed in bnb and awq | ||||
| @ -36,9 +36,10 @@ RUN python3 -m pip install --no-cache-dir einops | ||||
| # Add bitsandbytes for mixed int8 testing | ||||
| RUN python3 -m pip install --no-cache-dir bitsandbytes | ||||
|  | ||||
| # Add gptqmodel for gptq quantization testing, installed from source for pytorch==2.6.0 compatibility | ||||
| RUN python3 -m pip install lm_eval | ||||
| RUN git clone https://github.com/ModelCloud/GPTQModel.git && cd GPTQModel && pip install -v . --no-build-isolation | ||||
| # Add auto-gptq for gptq quantization testing, installed from source for pytorch==2.5.1 compatibility | ||||
| # TORCH_CUDA_ARCH_LIST="7.5+PTX" is added to make the package compile for Tesla T4 gpus available for the CI. | ||||
| RUN pip install gekko | ||||
| RUN git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ && TORCH_CUDA_ARCH_LIST="7.5+PTX" python3 setup.py install | ||||
|  | ||||
| # Add optimum for gptq quantization testing | ||||
| RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum | ||||
| @ -50,11 +51,10 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef | ||||
| RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 | ||||
|  | ||||
| # Add vptq for quantization testing | ||||
| RUN pip install vptq | ||||
| RUN python3 -m pip install --no-cache-dir vptq | ||||
|  | ||||
| # Add spqr for quantization testing | ||||
| # Commented for now as No matching distribution found we need to reach out to the authors | ||||
| # RUN python3 -m pip install --no-cache-dir spqr_quant[gpu] | ||||
| RUN python3 -m pip install --no-cache-dir spqr_quant[gpu] | ||||
|  | ||||
| # Add hqq for quantization testing | ||||
| RUN python3 -m pip install --no-cache-dir hqq | ||||
| @ -63,36 +63,22 @@ RUN python3 -m pip install --no-cache-dir hqq | ||||
| RUN python3 -m pip install --no-cache-dir gguf | ||||
|  | ||||
| # Add autoawq for quantization testing | ||||
| # New release v0.2.8 | ||||
| RUN python3 -m pip install --no-cache-dir autoawq[kernels] | ||||
| # >=v0.2.7 needed for compatibility with transformers > 4.46 | ||||
| RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.7.post2/autoawq-0.2.7.post2-py3-none-any.whl | ||||
|  | ||||
| # Add quanto for quantization testing | ||||
| RUN python3 -m pip install --no-cache-dir optimum-quanto | ||||
|  | ||||
| # Add eetq for quantization testing | ||||
| RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submodule update --init --recursive && pip install . | ||||
| RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git | ||||
|  | ||||
| # # Add flute-kernel and fast_hadamard_transform for quantization testing | ||||
| # # Commented for now as they cause issues with the build | ||||
| # # TODO: create a new workflow to test them | ||||
| # RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1 | ||||
| # RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git | ||||
| # Add flute-kernel and fast_hadamard_transform for quantization testing | ||||
| RUN python3 -m pip install --no-cache-dir flute-kernel==0.3.0 -i https://flute-ai.github.io/whl/cu118 | ||||
| RUN python3 -m pip install --no-cache-dir fast_hadamard_transform==1.0.4.post1 | ||||
|  | ||||
| # Add compressed-tensors for quantization testing | ||||
| RUN python3 -m pip install --no-cache-dir compressed-tensors | ||||
|  | ||||
| # Add AMD Quark for quantization testing | ||||
| RUN python3 -m pip install --no-cache-dir amd-quark | ||||
|  | ||||
| # Add AutoRound for quantization testing | ||||
| RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0" | ||||
|  | ||||
| # Add transformers in editable mode | ||||
| RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch] | ||||
|  | ||||
| # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs | ||||
| RUN python3 -m pip uninstall -y kernels | ||||
|  | ||||
| # When installing in editable mode, `transformers` is not recognized as a package. | ||||
| # this line must be added in order for python to be aware of transformers. | ||||
| RUN cd transformers && python3 setup.py develop | ||||
|  | ||||
| @ -23,6 +23,8 @@ | ||||
|     title: تحميل النماذج المخصصة وتدريبها باستخدام 🤗 PEFT | ||||
|   - local: model_sharing | ||||
|     title: مشاركة نموذجك | ||||
|   - local: agents | ||||
|     title: الوكلاء | ||||
|   - local: llm_tutorial | ||||
|     title: التوليد باستخدام LLMs | ||||
|   - local: conversations | ||||
| @ -250,6 +252,8 @@ | ||||
|   title: أطر مفاهيمية | ||||
| # - sections: | ||||
| #   - sections: | ||||
| #     - local: main_classes/agent | ||||
| #       title: الوكلاء والأدوات | ||||
| #     - local: model_doc/auto | ||||
| #       title: فئات يتم إنشاؤها ديناميكيًا | ||||
| #     - local: main_classes/backbones | ||||
|  | ||||
							
								
								
									
										539
									
								
								docs/source/ar/agents.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										539
									
								
								docs/source/ar/agents.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,539 @@ | ||||
| # الوكلاء والأدوات | ||||
|  | ||||
| [[open-in-colab]] | ||||
|  | ||||
| ### ما هو الوكيل؟ | ||||
|  | ||||
| يمكن للنظم اللغوية الكبيرة (LLMs) التي تم تدريبها على أداء [نمذجة اللغة السببية](./tasks/language_modeling) التعامل مع مجموعة واسعة من المهام، ولكنها غالبًا ما تواجه صعوبات في المهام الأساسية مثل المنطق والحساب والبحث. وعندما يتم استدعاؤها في مجالات لا تؤدي فيها أداءً جيدًا، فإنها غالبًا ما تفشل في توليد الإجابة التي نتوقعها منها. | ||||
|  | ||||
| يتمثل أحد النهج للتغلب على هذا القصور في إنشاء "وكيل". | ||||
|  | ||||
| الوكيل هو نظام يستخدم LLM كمحرك له، ولديه حق الوصول إلى وظائف تسمى "أدوات". | ||||
|  | ||||
| هذه "الأدوات" هي وظائف لأداء مهمة، وتحتوي على جميع الأوصاف اللازمة للوكيل لاستخدامها بشكل صحيح. | ||||
|  | ||||
| يمكن برمجة الوكيل للقيام بما يلي: | ||||
| - وضع سلسلة من الإجراءات/الأدوات وتشغيلها جميعًا في نفس الوقت مثل [`CodeAgent`] على سبيل المثال | ||||
| - التخطيط للاجراءات/الأدوات وتنفيذها واحدة تلو الأخرى والانتظار حتى انتهاء كل إجراء قبل إطلاق التالي مثل [`ReactJsonAgent`] على سبيل المثال | ||||
|  | ||||
| ### أنواع الوكلاء | ||||
|  | ||||
| #### الوكيل البرمجي (Code agent) | ||||
|  | ||||
| يتبع هذا الوكيل خطوات محددة: أولًا، يخطط لسلسلة من الإجراءات التي يريد تنفيذها، ثم يولّد شفرة Python لتنفيذ جميع الإجراءات في نفس الوقت. وهو يتعامل بشكل أصلي مع أنواع مختلفة من المدخلات والمخرجات للأدوات التي يستخدمها، وبالتالي فهو الخيار الموصى به للمهام متعددة الوسائط. | ||||
|  | ||||
| #### وكلاء التفاعل | ||||
|  | ||||
| هذا هو الوكيل الذي يتم اللجوء إليه لحل مهام الاستدلال، حيث يجعل إطار ReAct ([Yao et al.، 2022](https://huggingface.co/papers/2210.03629)) من الكفاءة حقًا التفكير على أساس ملاحظاته السابقة. | ||||
|  | ||||
| نقوم بتنفيذ إصدارين من ReactJsonAgent:  | ||||
| - [`ReactJsonAgent`] يقوم بتوليد استدعاءات الأدوات كـ JSON في إخراجها. | ||||
| - [`ReactCodeAgent`] هو نوع جديد من ReactJsonAgent يقوم بتوليد استدعاءات أدواته كمقاطع من التعليمات البرمجية، والتي تعمل بشكل جيد حقًا مع LLMs التي تتمتع بأداء  قوي في البرمجة. | ||||
|  | ||||
| > [!TIP] | ||||
| > اقرأ منشور المدونة [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) لمعرفة المزيد عن وكيل ReAct. | ||||
|  | ||||
|  | ||||
|  | ||||
| على سبيل المثال، إليك كيف يشق وكيل ReAct Code طريقه عبر السؤال التالي. | ||||
|  | ||||
| ```py3 | ||||
| >>> agent.run( | ||||
| ...     "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?", | ||||
| ... ) | ||||
| =====New task===== | ||||
| How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need? | ||||
| ====Agent is executing the code below: | ||||
| bert_blocks = search(query="number of blocks in BERT base encoder") | ||||
| print("BERT blocks:", bert_blocks) | ||||
| ==== | ||||
| Print outputs: | ||||
| BERT blocks: twelve encoder blocks | ||||
|  | ||||
| ====Agent is executing the code below: | ||||
| attention_layer = search(query="number of layers in Attention is All You Need") | ||||
| print("Attention layers:", attention_layer) | ||||
| ==== | ||||
| Print outputs: | ||||
| Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture. | ||||
|  | ||||
| ====Agent is executing the code below: | ||||
| bert_blocks = 12 | ||||
| attention_layers = 6 | ||||
| diff = bert_blocks - attention_layers | ||||
| print("Difference in blocks:", diff) | ||||
| final_answer(diff) | ||||
| ==== | ||||
|  | ||||
| Print outputs: | ||||
| Difference in blocks: 6 | ||||
|  | ||||
| Final answer: 6 | ||||
| ``` | ||||
|  | ||||
| ### كيف يمكنني بناء وكيل؟ | ||||
|  | ||||
| لتهيئة وكيل، تحتاج إلى هذه الوسائط: | ||||
|  | ||||
| - نموذج لغوي كبير (LLM) يشكل المحرك الأساسي للوكيل. الوكيل نفسه ليس النموذج اللغوي، بل هو برنامج يستخدم النموذج اللغوي كمحرك له. | ||||
| - موجه النظام (system prompt): هذه هي التعليمات التي يتم إعطاؤها للنموذج اللغوي لإنشاء مخرجاته. | ||||
| - صندوق أدوات (toolbox) يختار الوكيل منه الأدوات لتنفيذها | ||||
| - محلل (parser) لاستخراج الأدوات التي يجب استدعاؤها من مخرجات النموذج اللغوي LLM والأدوات التي يجب استخدامها | ||||
|  | ||||
| عند تهيئة نظام الوكيل، يتم استخدام سمات الأداة لإنشاء وصف للأداة، ثم يتم دمجها في موجه النظام الخاص `system_prompt` للوكيل لإعلامه بالأدوات التي يمكنه استخدامها ولماذا. | ||||
|  | ||||
| للبدء، يرجى تثبيت `agents` الإضافية لتثبيت جميع التبعيات الافتراضية. | ||||
|  | ||||
| ```bash | ||||
| pip install transformers[agents] | ||||
| ``` | ||||
|  | ||||
| قم ببناء محرك LLM الخاص بك من خلال تعريف طريقة `llm_engine` التي تقبل قائمة من [الرسائل](./chat_templating) وتعيد النص. يجب أن تقبل هذه الدالة القابلة للاستدعاء أيضًا معامل `stop` يشير إلى متى يجب التوقف عن التوليد. | ||||
|  | ||||
| ```python | ||||
| from huggingface_hub import login, InferenceClient | ||||
|  | ||||
| login("<YOUR_HUGGINGFACEHUB_API_TOKEN>") | ||||
|  | ||||
| client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct") | ||||
|  | ||||
| def llm_engine(messages, stop_sequences=["Task"]) -> str: | ||||
|     response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000) | ||||
|     answer = response.choices[0].message.content | ||||
|     return answer | ||||
| ``` | ||||
|  | ||||
| يمكنك استخدام أي طريقة `llm_engine` طالما أنها: | ||||
| 1. تتبع تنسيق [الرسائل](./chat_templating.md) لمدخلاتها (`List[Dict[str, str]]`) وتعيد `str` | ||||
| 2. تتوقف عن توليد المخرجات عند التسلسلات التي تم تمريرها في معامل `stop` | ||||
|  | ||||
| أنت بحاجة أيضًا إلى معامل `tools` الذي يقبل قائمة من الأدوات (`Tools`)، ويمكن أن تكون هذه القائمة فارغة. يمكنك أيضًا إضافة صندوق الأدوات الافتراضي إلى قائمة `tools` باستخدام المعامل الاختياري `add_base_tools=True`. | ||||
|  | ||||
| الآن يمكنك إنشاء وكيل، مثل [`CodeAgent`], وتشغيله. ولتسهيل الأمر، نقدم أيضًا فئة [`HfEngine`] التي تستخدم `huggingface_hub.InferenceClient` بشكل مخفى. | ||||
|  | ||||
| ```python | ||||
| from transformers import CodeAgent, HfEngine | ||||
|  | ||||
| llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct") | ||||
| agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) | ||||
|  | ||||
| agent.run( | ||||
|     "Could you translate this sentence from French, say it out loud and return the audio.", | ||||
|     sentence="Où est la boulangerie la plus proche?", | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| هذه الميزة ستكون مفيدة في حالة الحاجة الملحة! يمكنك حتى ترك معامل `llm_engine` غير محدد، وسيتم إنشاء [`HfEngine`] بشكل تلقائي. | ||||
|  | ||||
| ```python | ||||
| from transformers import CodeAgent | ||||
|  | ||||
| agent = CodeAgent(tools=[], add_base_tools=True) | ||||
|  | ||||
| agent.run( | ||||
|     "Could you translate this sentence from French, say it out loud and give me the audio.", | ||||
|     sentence="Où est la boulangerie la plus proche?", | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| لاحظ أننا استخدمنا معامل "sentence" إضافي: يمكنك تمرير النص كمعامل إضافي إلى النموذج. | ||||
|  | ||||
| يمكنك أيضًا استخدام هذا للإشارة إلى مسار الملفات المحلية أو البعيدة للنموذج لاستخدامها: | ||||
|  | ||||
| ```py | ||||
| from transformers import ReactCodeAgent | ||||
|  | ||||
| agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) | ||||
|  | ||||
| agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3") | ||||
| ``` | ||||
|  | ||||
|  | ||||
| تم تحديد موجه النظام ومحلل المخرجات تلقائيًا، ولكن يمكنك فحصهما بسهولة عن طريق استدعاء `system_prompt_template` على وكيلك. | ||||
|  | ||||
| ```python | ||||
| print(agent.system_prompt_template) | ||||
| ``` | ||||
|  | ||||
| من المهم أن تشرح بأكبر قدر ممكن من الوضوح المهمة التي تريد تنفيذها. | ||||
| كل عملية [`~Agent.run`] مستقلة، وبما أن الوكيل مدعوم من LLM، فقد تؤدي الاختلافات الطفيفة في موجهك إلى نتائج مختلفة تمامًا. | ||||
| يمكنك أيضًا تشغيل وكيل بشكل متتالي لمهام مختلفة: في كل مرة يتم فيها إعادة تهيئة سمتي `agent.task` و`agent.logs`. | ||||
|  | ||||
|  | ||||
| #### تنفيذ التعليمات البرمجية | ||||
|  | ||||
| يقوم مفسر Python بتنفيذ التعليمات البرمجية على مجموعة من المدخلات التي يتم تمريرها جنبًا إلى جنب مع أدواتك. | ||||
| يجب أن يكون هذا الأمر آمنًا لأن الوظائف الوحيدة التي يمكن استدعاؤها هي الأدوات التي قدمتها (خاصة إذا كانت أدوات من Hugging Face فقط) ووظيفة الطباعة، لذا فأنت مقيد بالفعل بما يمكن تنفيذه. | ||||
|  | ||||
| مفسر Python لا يسمح أيضًا باستدعاء دوال بشكل افتراضي خارج قائمة آمنة، لذا فإن جميع الهجمات الأكثر وضوحًا لا ينبغي أن تكون مشكلة. | ||||
| يمكنك أيضًا الإذن باستيرادات إضافية عن طريق تمرير الوحدات النمطية المصرح بها كقائمة من السلاسل في معامل  `additional_authorized_imports` عند تهيئة [`ReactCodeAgent`] أو [`CodeAgent`]: | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import ReactCodeAgent | ||||
|  | ||||
| >>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4']) | ||||
| >>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?") | ||||
|  | ||||
| (...) | ||||
| 'Hugging Face – Blog' | ||||
| ``` | ||||
|  | ||||
| سيتم إيقاف التنفيذ عند أي رمز يحاول تنفيذ عملية غير قانونية أو إذا كان هناك خطأ Python عادي في التعليمات البرمجية التي تم إنشاؤها بواسطة الوكيل. | ||||
|  | ||||
| > [!WARNING] | ||||
| > يمكن لـ LLM توليد شفرة برمجية عشوائية سيتم تنفيذها بعد ذلك: لا تقم باستدعاء أي دوال غير آمنة! | ||||
|  | ||||
| ### موجه النظام | ||||
|  | ||||
| يولّد الوكيل، أو بالأحرى LLM الذي يقود الوكيل، مخرجات بناءً على موجه النظام. يمكن تخصيص موجه النظام وتصميمه للمهام المقصودة. على سبيل المثال، تحقق من موجه النظام لـ [`ReactCodeAgent`] (الإصدار أدناه مبسط قليلاً). | ||||
|  | ||||
| ```text | ||||
| You will be given a task to solve as best you can. | ||||
| You have access to the following tools: | ||||
| <<tool_descriptions>> | ||||
|  | ||||
| To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. | ||||
|  | ||||
| At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use. | ||||
| Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence. | ||||
| During each intermediate step, you can use 'print()' to save whatever important information you will then need. | ||||
| These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step. | ||||
|  | ||||
| In the end you have to return a final answer using the `final_answer` tool. | ||||
|  | ||||
| Here are a few examples using notional tools: | ||||
| --- | ||||
| {examples} | ||||
|  | ||||
| Above example were using notional tools that might not exist for you. You only have acces to those tools: | ||||
| <<tool_names>> | ||||
| You also can perform computations in the python code you generate. | ||||
|  | ||||
| Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward. | ||||
|  | ||||
| Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks. | ||||
| Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result. | ||||
|  | ||||
| Remember to make sure that variables you use are all defined. | ||||
|  | ||||
| Now Begin! | ||||
| ``` | ||||
|  | ||||
| يتضمن موجه النظام: | ||||
| - *مقدمة* تشرح كيف يجب أن يتصرف الوكيل والأدوات التي يجب عليه استخدامها. | ||||
| - وصف لجميع الأدوات التي يتم تحديدها بواسطة رمز `<<tool_descriptions>>` الذي يتم استبداله ديناميكيًا في وقت التشغيل بالأدوات التي يحددها المستخدم أو يختارها. | ||||
|     - يأتي وصف الأداة من سمات الأداة، `name`، و`description`، و`inputs` و`output_type`، وقالب `jinja2` بسيط يمكنك تحسينه. | ||||
| - شكل المخرج المتوقع. | ||||
|  | ||||
| يمكنك تحسين موجه النظام، على سبيل المثال، عن طريق إضافة شرح لتنسيق المخرجات. | ||||
|  | ||||
| للحصول على أقصى قدر من المرونة، يمكنك الكتابة فوق قالب موجه النظام بالكامل عن طريق تمرير موجه مخصص كمعامل إلى معلمة `system_prompt`. | ||||
|  | ||||
| ```python | ||||
| from transformers import ReactJsonAgent | ||||
| from transformers.agents import PythonInterpreterTool | ||||
|  | ||||
| agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}") | ||||
| ``` | ||||
|  | ||||
| > [!WARNING] | ||||
| > يرجى التأكد من تحديد سلسلة `<<tool_descriptions>>` في مكان ما في `template` حتى يكون الوكيل على علم  | ||||
| بالأدوات المتاحة. | ||||
|  | ||||
|  | ||||
| ### فحص تشغيل الوكيل | ||||
|  | ||||
| فيما يلي بعض السمات المفيدة لفحص ما حدث بعد التشغيل: | ||||
| - تخزن `agent.logs` سجلات مفصلة للوكيل. في كل خطوة من تشغيل الوكيل، يتم تخزين كل شيء في قاموس يُلحق بعد ذلك بـ `agent.logs`. | ||||
| - تشغيل `agent.write_inner_memory_from_logs()` يخلق ذاكرة داخلية لسجلات الوكيل للنظام LLM لعرضها، كقائمة من رسائل الدردشة. تنتقل هذه الطريقة عبر كل خطوة من سجل الوكيل ولا تخزن سوى ما يهمها كرسالة: على سبيل المثال، سيحفظ موجه النظام والمهمة في رسائل منفصلة، ثم لكل خطوة سيخزن مخرج LLM كرسالة، ومخرج استدعاء الأداة كرسالة أخرى. استخدم هذا إذا كنت تريد عرضًا عامًا لما حدث - ولكن لن يتم نسخ كل سجل بواسطة هذه الطريقة. | ||||
|  | ||||
| ## الأدوات | ||||
|  | ||||
| الأداة هي عبارة عن وظيفة أساسية يستخدمها الوكيل لتنفيذ مهمة محددة. | ||||
|  | ||||
| يمكنك على سبيل المثال التحقق من [`PythonInterpreterTool`]: لديه اسم ووصف ووصف للمدخلات ونوع للمخرج، وطريقة `__call__` التي تقوم بتنفيذ المهمة المطلوبة. | ||||
|  | ||||
| عند تهيئة الوكيل، يتم استخدام سمات الأداة لتوليد وصف للأداة يتم تضمينه في موجه النظام الخاص بالوكيل. يتيح هذا للوكيل معرفة الأدوات التي يمكنه استخدامها ولماذا. | ||||
|  | ||||
| ### صندوق الأدوات الافتراضي | ||||
|  | ||||
| يأتي Transformers مع صندوق أدوات افتراضي لتمكين الوكلاء، والذي يمكنك إضافته إلى وكيلك عند التهيئة باستخدام معامل `add_base_tools = True`: | ||||
|  | ||||
| - **الإجابة على أسئلة المستند**: الإجابة على سؤال حول المستند (مثل ملف PDF) بتنسيق صورة ([Donut](./model_doc/donut)) | ||||
| - **الإجابة على أسئلة الصور**: الإجابة على سؤال حول صورة ([VILT](./model_doc/vilt)) | ||||
| - **الكلام إلى نص**: تحويل الكلام المنطوق إلى نص ([Whisper](./model_doc/whisper)) | ||||
| - **النص إلى كلام**: تحويل النص إلى كلام ([SpeechT5](./model_doc/speecht5)) | ||||
| - **الترجمة**: ترجمة جملة معينة من لغة المصدر إلى لغة الهدف. | ||||
| - **مفسر كود Python**: تشغيل كود Python الذي تم إنشاؤه بواسطة LLM في بيئة آمنة. لن يتم إضافة هذه الأداة إلى [`ReactJsonAgent`] إلا إذا استخدمت `add_base_tools=True`، نظرًا لأن الوكلاء المستندين إلى التعليمات البرمجية يمكنهم بالفعل تنفيذ كود Python | ||||
|  | ||||
|  | ||||
| يمكنك استخدام أداة يدويًا عن طريق استدعاء دالة [`load_tool`] وتحديد مهمة لتنفيذها. | ||||
|  | ||||
| ```python | ||||
| from transformers import load_tool | ||||
|  | ||||
| tool = load_tool("text-to-speech") | ||||
| audio = tool("This is a text to speech tool") | ||||
| ``` | ||||
|  | ||||
| ### إنشاء أداة جديدة | ||||
|  | ||||
| يمكنك إنشاء أداتك الخاصة لتغطية حالات الاستخدام التي لا تغطيها الأدوات الافتراضية من Hugging Face. | ||||
| على سبيل المثال، دعنا نقوم بإنشاء أداة تعرض النموذج الأكثر تنزيلًا لمهمة معينة من Hub. | ||||
|  | ||||
| سوف نبدأ بالكود التالي. | ||||
|  | ||||
| ```python | ||||
| from huggingface_hub import list_models | ||||
|  | ||||
| task = "text-classification" | ||||
|  | ||||
| model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) | ||||
| print(model.id) | ||||
| ``` | ||||
|  | ||||
| يمكن تحويل هذه الشيفرة إلى فئة ترث من الفئة العليا [`Tool`]. | ||||
|  | ||||
| تحتاج الأداة المخصصة إلى: | ||||
|  | ||||
| - اسم `name`، والتي تمثل اسم الأداة نفسها. عادةً ما يصف الاسم وظيفتها. بما أن الكود يعيد النموذج الأكثر تنزيلًا لمهمة ما، فلنسمها `model_download_counter`. | ||||
| - تستخدم خاصية `description` لملء موجه نظام الوكيل. | ||||
| - خاصية `inputs`، والتي هي عبارة عن قاموس بمفاتيح "type" و"description". يحتوي على معلومات تساعد المفسر Python على اتخاذ خيارات مستنيرة بشأن المدخلات. | ||||
| - خاصية `output_type`، والتي تحدد نوع المخرج. | ||||
| - طريقة `forward` والتي تحتوي على الكود الذي سيتم تنفيذه للحصول على النتيجة النهائية. | ||||
|  | ||||
| ```python | ||||
| from transformers import Tool | ||||
| from huggingface_hub import list_models | ||||
|  | ||||
| class HFModelDownloadsTool(Tool): | ||||
|     name = "model_download_counter" | ||||
|     description = ( | ||||
|         "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. " | ||||
|         "It returns the name of the checkpoint." | ||||
|     ) | ||||
|  | ||||
|     inputs = { | ||||
|         "task": { | ||||
|             "type": "text", | ||||
|             "description": "the task category (such as text-classification, depth-estimation, etc)", | ||||
|         } | ||||
|     } | ||||
|     output_type = "text" | ||||
|  | ||||
|     def forward(self, task: str): | ||||
|         model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) | ||||
|         return model.id | ||||
| ``` | ||||
|  | ||||
| الآن بعد أن أصبحت فئة `HFModelDownloadsTool` المخصصة جاهزة، يمكنك حفظها في ملف باسم `model_downloads.py` واستيرادها للاستخدام. | ||||
|  | ||||
| ```python | ||||
| from model_downloads import HFModelDownloadsTool | ||||
|  | ||||
| tool = HFModelDownloadsTool() | ||||
| ``` | ||||
|  | ||||
| يمكنك أيضًا مشاركة أداتك المخصصة في Hub عن طريق استدعاء [`~Tool.push_to_hub`] على الأداة. تأكد من أنك قمت بإنشاء مستودع لها على Hub وأنك تستخدم رمز وصول للقراءة. | ||||
|  | ||||
| ```python | ||||
| tool.push_to_hub("{your_username}/hf-model-downloads") | ||||
| ``` | ||||
|  | ||||
| قم بتحميل الأداة باستخدام دالة [`~Tool.load_tool`] ومررها إلى معلمة `tools` في الوكيل الخاص بك. | ||||
|  | ||||
| ```python | ||||
| from transformers import load_tool, CodeAgent | ||||
|  | ||||
| model_download_tool = load_tool("m-ric/hf-model-downloads") | ||||
| agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine) | ||||
| agent.run( | ||||
|     "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| ستحصل على ما يلي: | ||||
|  | ||||
| ```text | ||||
| ======== New task ======== | ||||
| Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub? | ||||
| ==== Agent is executing the code below: | ||||
| most_downloaded_model = model_download_counter(task="text-to-video") | ||||
| print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.") | ||||
| ==== | ||||
| ``` | ||||
|  | ||||
| والناتج: | ||||
|  | ||||
| `"النموذج الأكثر تنزيلًا لمهمة 'text-to-video' هو ByteDance/AnimateDiff-Lightning."` | ||||
|  | ||||
| ### إدارة صندوق أدوات الوكيل الخاص بك | ||||
|  | ||||
| إذا كنت قد قمت بتهيئة وكيل، فمن غير الملائم إعادة تهيئته من البداية لإضافة أداة جديدة ترغب في استخدامها. باستخدام مكتبة Transformers، يمكنك إدارة صندوق أدوات الوكيل بإضافة أو استبدال أداة موجودة. | ||||
|  | ||||
| دعنا نضيف الأداة `model_download_tool` إلى وكيل تم تهيئته مسبقًا باستخدام صندوق الأدوات الافتراضي. | ||||
|  | ||||
| ```python | ||||
| from transformers import CodeAgent | ||||
|  | ||||
| agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) | ||||
| agent.toolbox.add_tool(model_download_tool) | ||||
| ``` | ||||
|  | ||||
| الآن يمكننا الاستفادة من الأداة الجديدة وأداة تحويل النص إلى كلام السابقة: | ||||
|  | ||||
| ```python | ||||
|     agent.run( | ||||
|         "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?" | ||||
|     ) | ||||
| ``` | ||||
|  | ||||
| | **Audio**                                                                                                                                            | | ||||
| |------------------------------------------------------------------------------------------------------------------------------------------------------| | ||||
| | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> | | ||||
|  | ||||
| > [!WARNING] | ||||
| > احترس عند إضافة أدوات إلى وكيل يعمل بالفعل لأنه يمكن أن يؤثر على اختيار الأداة لصالح أداتك أو اختيار أداة أخرى غير المحددة بالفعل. | ||||
|  | ||||
| استخدم طريقة `agent.toolbox.update_tool()` لاستبدال أداة موجودة في صندوق أدوات الوكيل. | ||||
| هذا مفيد إذا كانت أداتك الجديدة بديلاً مباشرًا للأداة الموجودة لأن الوكيل يعرف بالفعل كيفية تنفيذ تلك المهمة المحددة. | ||||
| تأكد فقط من اتباع الأداة الجديدة لنفس واجهة برمجة التطبيقات (API) للأداة المستبدلة أو قم بتكييف قالب موجه النظام لضمان تحديث جميع الأمثلة التي تستخدم الأداة المستبدلة. | ||||
|  | ||||
| ### استخدام مجموعة من الأدوات | ||||
|  | ||||
| يمكنك الاستفادة من مجموعات الأدوات باستخدام كائن ToolCollection، مع تحديد مجموعة الأدوات التي تريد استخدامها. | ||||
| ثم قم بتمريرها كقائمة لتهيئة الوكيل الخاص بك، وبدء استخدامها! | ||||
|  | ||||
| ```py | ||||
| from transformers import ToolCollection, ReactCodeAgent | ||||
|  | ||||
| image_tool_collection = ToolCollection(collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f") | ||||
| agent = ReactCodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True) | ||||
|  | ||||
| agent.run("Please draw me a picture of rivers and lakes.") | ||||
| ``` | ||||
|  | ||||
| لتسريع البداية، يتم تحميل الأدوات فقط إذا استدعاها الوكيل. | ||||
|  | ||||
| ستحصل على هذه الصورة: | ||||
|  | ||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" /> | ||||
|  | ||||
| ### استخدام gradio-tools | ||||
|  | ||||
| [gradio-tools](https://github.com/freddyaboulton/gradio-tools) هي مكتبة قوية تتيح استخدام Hugging | ||||
| Face Spaces كأدوات. تدعم العديد من المساحات الموجودة بالإضافة إلى مساحات مخصصة. | ||||
|  | ||||
| تدعم مكتبة Transformers `gradio_tools` باستخدام طريقة [`Tool.from_gradio`] في الفئة. على سبيل المثال، دعنا نستخدم [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) من مجموعة أدوات `gradio-tools` لتحسين المطالبات لإنشاء صور أفضل. | ||||
|  | ||||
| استورد وقم بتهيئة الأداة، ثم مررها إلى طريقة `Tool.from_gradio`: | ||||
|  | ||||
| ```python | ||||
| from gradio_tools import StableDiffusionPromptGeneratorTool | ||||
| from transformers import Tool, load_tool, CodeAgent | ||||
|  | ||||
| gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool() | ||||
| prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool) | ||||
| ``` | ||||
|  | ||||
| الآن يمكنك استخدامه مثل أي أداة أخرى. على سبيل المثال، دعنا نحسن الموجه `a rabbit wearing a space suit`. | ||||
|  | ||||
| ```python | ||||
| image_generation_tool = load_tool('huggingface-tools/text-to-image') | ||||
| agent = CodeAgent(tools=[prompt_generator_tool, image_generation_tool], llm_engine=llm_engine) | ||||
|  | ||||
| agent.run( | ||||
|     "Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit' | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| يستفيد النموذج بشكل كافٍ من الأداة: | ||||
|  | ||||
| ```text | ||||
| ======== New task ======== | ||||
| Improve this prompt, then generate an image of it. | ||||
| You have been provided with these initial arguments: {'prompt': 'A rabbit wearing a space suit'}. | ||||
| ==== Agent is executing the code below: | ||||
| improved_prompt = StableDiffusionPromptGenerator(query=prompt) | ||||
| while improved_prompt == "QUEUE_FULL": | ||||
|     improved_prompt = StableDiffusionPromptGenerator(query=prompt) | ||||
| print(f"The improved prompt is {improved_prompt}.") | ||||
| image = image_generator(prompt=improved_prompt) | ||||
| ==== | ||||
| ``` | ||||
|  | ||||
| قبل إنشاء الصورة أخيرًا: | ||||
|  | ||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit_spacesuit_flux.webp" /> | ||||
|  | ||||
| > [!WARNING] | ||||
| > تتطلب gradio-tools إدخالات وإخراجات *نصية* حتى عند العمل مع طرائق مختلفة مثل كائنات الصور والصوت. الإدخالات والإخراجات الصورية والصوتية غير متوافقة حاليًا. | ||||
|  | ||||
| ### استخدام أدوات LangChain | ||||
|  | ||||
| نحن نحب LangChain ونعتقد أنها تحتوي على مجموعة أدوات قوية للغاية. | ||||
| لاستيراد أداة من LangChain، استخدم الطريقة `from_langchain()`. | ||||
|  | ||||
| فيما يلي كيفية استخدامها لإعادة إنشاء نتيجة البحث في المقدمة باستخدام أداة بحث الويب LangChain. | ||||
|  | ||||
| ```python | ||||
| from langchain.agents import load_tools | ||||
| from transformers import Tool, ReactCodeAgent | ||||
|  | ||||
| search_tool = Tool.from_langchain(load_tools(["serpapi"])[0]) | ||||
|  | ||||
| agent = ReactCodeAgent(tools=[search_tool]) | ||||
|  | ||||
| agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?") | ||||
| ``` | ||||
|  | ||||
| ## واجهة Gradio | ||||
|  | ||||
| يمكنك الاستفادة من `gradio.Chatbot` لعرض أفكار الوكيل الخاص بك باستخدام `stream_to_gradio`، إليك مثال: | ||||
|  | ||||
| ```py | ||||
| import gradio as gr | ||||
| from transformers import ( | ||||
|     load_tool, | ||||
|     ReactCodeAgent, | ||||
|     HfEngine, | ||||
|     stream_to_gradio, | ||||
| ) | ||||
|  | ||||
| # Import tool from Hub | ||||
| image_generation_tool = load_tool("m-ric/text-to-image") | ||||
|  | ||||
| llm_engine = HfEngine("meta-llama/Meta-Llama-3-70B-Instruct") | ||||
|  | ||||
| # Initialize the agent with the image generation tool | ||||
| agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine) | ||||
|  | ||||
|  | ||||
| def interact_with_agent(task): | ||||
|     messages = [] | ||||
|     messages.append(gr.ChatMessage(role="user", content=task)) | ||||
|     yield messages | ||||
|     for msg in stream_to_gradio(agent, task): | ||||
|         messages.append(msg) | ||||
|         yield messages + [ | ||||
|             gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!") | ||||
|         ] | ||||
|     yield messages | ||||
|  | ||||
|  | ||||
| with gr.Blocks() as demo: | ||||
|     text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.") | ||||
|     submit = gr.Button("Run illustrator agent!") | ||||
|     chatbot = gr.Chatbot( | ||||
|         label="Agent", | ||||
|         type="messages", | ||||
|         avatar_images=( | ||||
|             None, | ||||
|             "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png", | ||||
|         ), | ||||
|     ) | ||||
|     submit.click(interact_with_agent, [text_input], [chatbot]) | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     demo.launch() | ||||
| ``` | ||||
| @ -15,4 +15,4 @@ | ||||
| - الوصول إلى جميع أوزان الانتباه لكل رأس في BERT/GPT/GPT-2، | ||||
| - استرجاع قيم ومشتقات  مخرجات الرأس لحساب درجة أهمية الرأس وحذفه كما هو موضح في https://arxiv.org/abs/1905.10650. | ||||
|  | ||||
| ولمساعدتك على فهم واستخدام هذه الميزات بسهولة، أضفنا مثالًا برمجيًا محددًا: [bertology.py](https://github.com/huggingface/transformers-research-projects/tree/main/bertology/run_bertology.py) أثناء استخراج المعلومات  وتقليص من نموذج تم تدريبه مسبقًا على GLUE. | ||||
| ولمساعدتك على فهم واستخدام هذه الميزات بسهولة، أضفنا مثالًا برمجيًا محددًا: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) أثناء استخراج المعلومات  وتقليص من نموذج تم تدريبه مسبقًا على GLUE. | ||||
| @ -77,7 +77,7 @@ model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename) | ||||
|  | ||||
| الآن لديك إمكانية الوصول إلى النسخة الكامل غير المكممة للنموذج في بيئة PyTorch، حيث يمكنك دمجه مع مجموعة كبيرة من الأدوات الأخرى. | ||||
|  | ||||
| لإعادة التحويل إلى ملف `gguf`، نوصي باستخدام ملف [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py) من llama.cpp. | ||||
| لإعادة التحويل إلى ملف `gguf`، نوصي باستخدام ملف [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) من llama.cpp. | ||||
|  | ||||
| فيما يلي كيفية إكمال البرنامج النصي أعلاه لحفظ النموذج وإعادة تصديره مرة أخرى إلى `gguf`: | ||||
|  | ||||
|  | ||||
| @ -2,7 +2,7 @@ | ||||
|  | ||||
| بالإضافة إلى دفاتر الملاحظات [notebooks](./notebooks) الخاصة بـ 🤗 Transformers، هناك أيضًا نصوص برمجية توضيحية تُظهر كيفية تدريب نموذج لمهمة باستخدام [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch) أو [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) أو [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). | ||||
|  | ||||
| كما ستجد النصوص البرمجية التي استخدمناها في [مشاريع الأبحاث](https://github.com/huggingface/transformers-research-projects/) و [الأمثلة القديمة](https://github.com/huggingface/transformers/tree/main/examples/legacy) والتي ساهم بها المجتمع بشكل أساسي. هذه النصوص البرمجية غير مدعومة بشكل نشط وقد تتطلب إصدارًا محددًا من مكتبة 🤗 Transformers والذي من المحتمل أن يكون غير متوافق مع الإصدار الأحدث من المكتبة. | ||||
| كما ستجد النصوص البرمجية التي استخدمناها في [مشاريع الأبحاث](https://github.com/huggingface/transformers/tree/main/examples/research_projects) و [الأمثلة القديمة](https://github.com/huggingface/transformers/tree/main/examples/legacy) والتي ساهم بها المجتمع بشكل أساسي. هذه النصوص البرمجية غير مدعومة بشكل نشط وقد تتطلب إصدارًا محددًا من مكتبة 🤗 Transformers والذي من المحتمل أن يكون غير متوافق مع الإصدار الأحدث من المكتبة. | ||||
|  | ||||
| لا يُتوقع أن تعمل النصوص البرمجية التوضيحية بشكل مباشر على كل مشكلة، وقد تحتاج إلى تكييف النص البرمجي مع المشكلة التي تحاول حلها. ولمساعدتك في ذلك، تعرض معظم النصوص البرمجية كيفية معالجة البيانات قبل التدريب بشكل كامل، مما يتيح لك تحريرها حسب الحاجة لحالتك الاستخدام. | ||||
|  | ||||
|  | ||||
| @ -116,11 +116,11 @@ optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_s | ||||
|  | ||||
| <Tip warning={true}> | ||||
|  | ||||
| لم يعد يتم دعم `transformers.onnx`  يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة. | ||||
| لم يعد يتم دعم `tranformers.onnx`  يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة. | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
| لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `transformers.onnx`، ثبّت التبعيات الإضافية: | ||||
| لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `tranformers.onnx`، ثبّت التبعيات الإضافية: | ||||
|  | ||||
| ```bash | ||||
| pip install transformers[onnx] | ||||
|  | ||||
| @ -674,7 +674,29 @@ use_cpu: false | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| <hfoption id="Tensor Parallelism with PyTorch 2"> | ||||
|  | ||||
| ```yml | ||||
| compute_environment: LOCAL_MACHINE | ||||
| tp_config: | ||||
|   tp_size: 4 | ||||
| distributed_type: TP | ||||
| downcast_bf16: 'no' | ||||
| machine_rank: 0 | ||||
| main_training_function: main | ||||
| mixed_precision: 'no' | ||||
| num_machines: 1 | ||||
| num_processes: 4 | ||||
| rdzv_backend: static | ||||
| same_network: true | ||||
| tpu_env: [] | ||||
| tpu_use_cluster: false | ||||
| tpu_use_sudo: false | ||||
| use_cpu: false | ||||
|  | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| </hfoptions> | ||||
| يُعد أمر  [`accelerate_launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) هو الطريقة المُوصى بها لتشغيل نص البرمجى للتدريب على نظام موزع باستخدام Accelerate و [`Trainer`] مع المعلمات المحددة في `config_file.yaml`. يتم حفظ هذا الملف في مجلد ذاكرة التخزين المؤقت لـ Accelerate ويتم تحميله تلقائيًا عند تشغيل `accelerate_launch`. | ||||
|  | ||||
|  | ||||
| @ -23,6 +23,8 @@ | ||||
|     title: Laden und Trainieren von Adaptern mit 🤗 PEFT | ||||
|   - local: model_sharing | ||||
|     title: Ein Modell teilen | ||||
|   - local: transformers_agents | ||||
|     title: Agents | ||||
|   - local: llm_tutorial | ||||
|     title: Generation with LLMs | ||||
|   title: Tutorials | ||||
|  | ||||
| @ -402,7 +402,7 @@ Andernfalls beginnen wir mit der Erstellung eines neuen Modells. Wir empfehlen d | ||||
| ein bestehendes Modell: | ||||
|  | ||||
| ```bash | ||||
| transformers add-new-model-like | ||||
| transformers-cli add-new-model-like | ||||
| ``` | ||||
|  | ||||
| Sie werden mit einem Fragebogen aufgefordert, die grundlegenden Informationen Ihres Modells einzugeben. | ||||
|  | ||||
| @ -63,7 +63,7 @@ Wenn Sie sich vergewissert haben, dass der Fehler noch nicht gemeldet wurde, geb | ||||
| Um das Betriebssystem und die Softwareversionen automatisch auszugeben, führen Sie den folgenden Befehl aus: | ||||
|  | ||||
| ```bash | ||||
| transformers env | ||||
| transformers-cli env | ||||
| ``` | ||||
|  | ||||
| Sie können denselben Befehl auch im Hauptverzeichnis des Repositorys ausführen: | ||||
|  | ||||
| @ -88,7 +88,7 @@ Die Bibliothek enthält derzeit JAX-, PyTorch- und TensorFlow-Implementierungen, | ||||
| 1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. | ||||
| 1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. | ||||
| 1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. | ||||
| 1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers-research-projects/tree/main/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers-research-projects/tree/main/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers-research-projects/tree/main/distillation) and a German version of DistilBERT. | ||||
| 1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. | ||||
| 1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. | ||||
| 1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. | ||||
| 1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. | ||||
|  | ||||
| @ -156,7 +156,7 @@ Die [`pipeline`] kann jedes Modell aus dem [Model Hub](https://huggingface.co/mo | ||||
|  | ||||
| <frameworkcontent> | ||||
| <pt> | ||||
| Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `AutoClass` below): | ||||
| Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` below): | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification | ||||
| @ -166,7 +166,7 @@ Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the | ||||
| ``` | ||||
| </pt> | ||||
| <tf> | ||||
| Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `TFAutoClass` below): | ||||
| Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` below): | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification | ||||
| @ -222,7 +222,7 @@ Anschließend wandelt der Tokenizer die Token in Zahlen um, um einen Tensor als | ||||
| Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält: | ||||
|  | ||||
| * [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token. | ||||
| * [attention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen. | ||||
| * [atttention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen. | ||||
|  | ||||
| Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben: | ||||
|  | ||||
|  | ||||
| @ -18,7 +18,7 @@ rendered properly in your Markdown viewer. | ||||
|  | ||||
| Neben den 🤗 Transformers [notebooks](./notebooks) gibt es auch Beispielskripte, die zeigen, wie man ein Modell für eine Aufgabe mit [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) oder [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax) trainiert. | ||||
|  | ||||
| Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers-research-projects/) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die größtenteils von der Community stammen. Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von 🤗 Transformers, die höchstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist. | ||||
| Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers/tree/main/examples/research_projects) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die größtenteils von der Community stammen. Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von 🤗 Transformers, die höchstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist. | ||||
|  | ||||
| Es wird nicht erwartet, dass die Beispielskripte bei jedem Problem sofort funktionieren. Möglicherweise müssen Sie das Skript an das Problem anpassen, das Sie zu lösen versuchen. Um Ihnen dabei zu helfen, legen die meisten Skripte vollständig offen, wie die Daten vorverarbeitet werden, so dass Sie sie nach Bedarf für Ihren Anwendungsfall bearbeiten können. | ||||
|  | ||||
|  | ||||
							
								
								
									
										323
									
								
								docs/source/de/transformers_agents.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										323
									
								
								docs/source/de/transformers_agents.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,323 @@ | ||||
| <!--Copyright 2023 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Transformers Agents | ||||
|  | ||||
| <Tip warning={true}> | ||||
|  | ||||
| Transformers Agents ist eine experimentelle API, die jederzeit geändert werden kann. Die von den Agenten zurückgegebenen Ergebnisse | ||||
| können variieren, da sich die APIs oder die zugrunde liegenden Modelle ändern können. | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
| Transformers Version v4.29.0 führt eine API ein, die auf dem Konzept von *Tools* und *Agenten* aufbaut. Sie können sie in | ||||
| [diesem Colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj) ausprobieren. | ||||
|  | ||||
| Kurz gesagt, es bietet eine API für natürliche Sprache auf der Grundlage von Transformers: Wir definieren eine Reihe von kuratierten Tools und entwerfen einen  | ||||
| Agenten, um natürliche Sprache zu interpretieren und diese Werkzeuge zu verwenden. Es ist von vornherein erweiterbar; wir haben einige relevante Tools kuratiert,  | ||||
| aber wir werden Ihnen zeigen, wie das System einfach erweitert werden kann, um jedes von der Community entwickelte Tool zu verwenden. | ||||
|  | ||||
| Beginnen wir mit einigen Beispielen dafür, was mit dieser neuen API erreicht werden kann. | ||||
| Sie ist besonders leistungsstark, wenn es um multimodale Aufgaben geht. Lassen Sie uns also eine Runde drehen, um Bilder zu erzeugen und Text vorzulesen. | ||||
|  | ||||
| ```py | ||||
| agent.run("Caption the following image", image=image) | ||||
| ``` | ||||
|  | ||||
| | **Input**                                                                                                                   | **Output**                        | | ||||
| |-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------| | ||||
| | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/beaver.png" width=200> | A beaver is swimming in the water | | ||||
|  | ||||
| --- | ||||
|  | ||||
| ```py | ||||
| agent.run("Read the following text out loud", text=text) | ||||
| ``` | ||||
| | **Input**                                                                                                               | **Output**                                   | | ||||
| |-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------| | ||||
| | A beaver is swimming in the water | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tts_example.wav" type="audio/wav"> your browser does not support the audio element. </audio> | ||||
|  | ||||
| --- | ||||
|  | ||||
| ```py | ||||
| agent.run( | ||||
|     "In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?", | ||||
|     document=document, | ||||
| ) | ||||
| ``` | ||||
| | **Input**                                                                                                                   | **Output**     | | ||||
| |-----------------------------------------------------------------------------------------------------------------------------|----------------| | ||||
| | <img src="https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/0/image/image.jpg" width=200> | ballroom foyer | | ||||
|  | ||||
| ## Schnellstart | ||||
|  | ||||
| Bevor Sie `agent.run` verwenden können, müssen Sie einen Agenten instanziieren, der ein großes Sprachmodell (LLM) ist.  | ||||
| Wir bieten Unterstützung für openAI-Modelle sowie für OpenSource-Alternativen von BigCode und OpenAssistant. Die openAI | ||||
| Modelle sind leistungsfähiger (erfordern aber einen openAI-API-Schlüssel, können also nicht kostenlos verwendet werden); Hugging Face | ||||
| bietet kostenlosen Zugang zu Endpunkten für BigCode- und OpenAssistant-Modelle. | ||||
|  | ||||
| Installieren Sie zunächst die `agents`-Extras, um alle Standardabhängigkeiten zu erhalten. | ||||
| ```bash | ||||
| pip install transformers[agents] | ||||
| ``` | ||||
|  | ||||
| Um openAI-Modelle zu verwenden, instanziieren Sie einen [`OpenAiAgent`], nachdem Sie die `openai`-Abhängigkeit installiert haben: | ||||
|  | ||||
| ```bash | ||||
| pip install openai | ||||
| ``` | ||||
|  | ||||
|  | ||||
| ```py | ||||
| from transformers import OpenAiAgent | ||||
|  | ||||
| agent = OpenAiAgent(model="text-davinci-003", api_key="<your_api_key>") | ||||
| ``` | ||||
|  | ||||
| Um BigCode oder OpenAssistant zu verwenden, melden Sie sich zunächst an, um Zugriff auf die Inference API zu erhalten: | ||||
|  | ||||
| ```py | ||||
| from huggingface_hub import login | ||||
|  | ||||
| login("<YOUR_TOKEN>") | ||||
| ``` | ||||
|  | ||||
| Dann instanziieren Sie den Agenten | ||||
|  | ||||
| ```py | ||||
| from transformers import HfAgent | ||||
|  | ||||
| # Starcoder | ||||
| agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") | ||||
| # StarcoderBase | ||||
| # agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase") | ||||
| # OpenAssistant | ||||
| # agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5") | ||||
| ``` | ||||
|  | ||||
| Dies geschieht mit der Inferenz-API, die Hugging Face derzeit kostenlos zur Verfügung stellt. Wenn Sie Ihren eigenen Inferenz | ||||
| Endpunkt für dieses Modell (oder einen anderen) haben, können Sie die obige URL durch Ihren URL-Endpunkt ersetzen. | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| StarCoder und OpenAssistant sind kostenlos und leisten bei einfachen Aufgaben bewundernswert gute Arbeit. Allerdings halten die Kontrollpunkte | ||||
| nicht, wenn es um komplexere Aufforderungen geht. Wenn Sie mit einem solchen Problem konfrontiert sind, empfehlen wir Ihnen, das OpenAI | ||||
| Modell auszuprobieren, das zwar leider nicht quelloffen ist, aber zur Zeit eine bessere Leistung erbringt. | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
| Sie sind jetzt startklar! Lassen Sie uns in die beiden APIs eintauchen, die Ihnen jetzt zur Verfügung stehen. | ||||
|  | ||||
| ### Einzelne Ausführung (run) | ||||
|  | ||||
| Die Methode der einmaligen Ausführung ist die Verwendung der [`~Agent.run`] Methode des Agenten: | ||||
|  | ||||
| ```py | ||||
| agent.run("Draw me a picture of rivers and lakes.") | ||||
| ``` | ||||
|  | ||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200> | ||||
|  | ||||
| Es wählt automatisch das (oder die) Werkzeug(e) aus, das (die) für die von Ihnen gewünschte Aufgabe geeignet ist (sind) und führt es (sie) entsprechend aus. Es | ||||
| kann eine oder mehrere Aufgaben in der gleichen Anweisung ausführen (je komplexer Ihre Anweisung ist, desto wahrscheinlicher ist es, dass | ||||
| der Agent scheitert). | ||||
|  | ||||
| ```py | ||||
| agent.run("Draw me a picture of the sea then transform the picture to add an island") | ||||
| ``` | ||||
|  | ||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sea_and_island.png" width=200> | ||||
|  | ||||
| <br/> | ||||
|  | ||||
|  | ||||
| Jede [`~Agent.run`] Operation ist unabhängig, so dass Sie sie mehrmals hintereinander mit unterschiedlichen Aufgaben ausführen können. | ||||
|  | ||||
| Beachten Sie, dass Ihr `Agent` nur ein großes Sprachmodell ist, so dass kleine Variationen in Ihrer Eingabeaufforderung völlig | ||||
| unterschiedliche Ergebnisse liefern können. Es ist wichtig, dass Sie die Aufgabe, die Sie ausführen möchten, so genau wie möglich erklären. Wir gehen noch weiter ins Detail | ||||
| wie man gute Prompts schreibt [hier](custom_tools#writing-good-user-inputs). | ||||
|  | ||||
| Wenn Sie einen Status über Ausführungszeiten hinweg beibehalten oder dem Agenten Nicht-Text-Objekte übergeben möchten, können Sie dies tun, indem Sie | ||||
| Variablen angeben, die der Agent verwenden soll. Sie könnten zum Beispiel das erste Bild von Flüssen und Seen erzeugen,  | ||||
| und das Modell bitten, dieses Bild zu aktualisieren und eine Insel hinzuzufügen, indem Sie Folgendes tun: | ||||
|  | ||||
| ```python | ||||
| picture = agent.run("Generate a picture of rivers and lakes.") | ||||
| updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture) | ||||
| ``` | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| Dies kann hilfreich sein, wenn das Modell Ihre Anfrage nicht verstehen kann und die Werkzeuge verwechselt. Ein Beispiel wäre: | ||||
|  | ||||
| ```py | ||||
| agent.run("Draw me the picture of a capybara swimming in the sea") | ||||
| ``` | ||||
|  | ||||
| Hier könnte das Modell die Anfrage auf zwei Arten interpretieren: | ||||
| - Die Funktion `Text-zu-Bild` erzeugt ein Wasserschwein, das im Meer schwimmt. | ||||
| - Oder Sie lassen das `Text-zu-Bild` ein Wasserschwein erzeugen und verwenden dann das Werkzeug `Bildtransformation`, um es im Meer schwimmen zu lassen. | ||||
|  | ||||
| Falls Sie das erste Szenario erzwingen möchten, können Sie dies tun, indem Sie die Eingabeaufforderung als Argument übergeben: | ||||
|  | ||||
| ```py | ||||
| agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea") | ||||
| ``` | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
|  | ||||
| ### Chat-basierte Ausführung (Chat) | ||||
|  | ||||
| Der Agent verfügt auch über einen Chat-basierten Ansatz, der die Methode [`~Agent.chat`] verwendet: | ||||
|  | ||||
| ```py | ||||
| agent.chat("Generate a picture of rivers and lakes") | ||||
| ``` | ||||
|  | ||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>  | ||||
|  | ||||
| ```py | ||||
| agent.chat("Transform the picture so that there is a rock in there") | ||||
| ``` | ||||
|  | ||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_and_beaver.png" width=200> | ||||
|  | ||||
| <br/> | ||||
|  | ||||
| Dies ist ein interessanter Ansatz, wenn Sie den Zustand über Anweisungen hinweg beibehalten möchten. Er ist besser für Experimente geeignet,  | ||||
| eignet sich aber eher für einzelne Anweisungen als für komplexe Anweisungen (die die [`~Agent.run`] | ||||
| Methode besser verarbeiten kann). | ||||
|  | ||||
| Diese Methode kann auch Argumente entgegennehmen, wenn Sie Nicht-Text-Typen oder bestimmte Aufforderungen übergeben möchten. | ||||
|  | ||||
| ### ⚠️ Fernausführung | ||||
|  | ||||
| Zu Demonstrationszwecken und damit es mit allen Setups verwendet werden kann, haben wir Remote-Executors für mehrere  | ||||
| der Standard-Tools erstellt, auf die der Agent in dieser Version Zugriff hat. Diese werden erstellt mit  | ||||
| [inference endpoints](https://huggingface.co/inference-endpoints). | ||||
|  | ||||
| Wir haben diese vorerst deaktiviert, aber um zu sehen, wie Sie selbst Remote-Executor-Tools einrichten können, | ||||
| empfehlen wir die Lektüre des [custom tool guide](./custom_tools). | ||||
|  | ||||
| ### Was passiert hier? Was sind Tools und was sind Agenten? | ||||
|  | ||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/diagram.png"> | ||||
|  | ||||
| #### Agenten | ||||
|  | ||||
| Der "Agent" ist hier ein großes Sprachmodell, dem wir per Eingabeaufforderung Zugang zu einem bestimmten Satz von Tools geben. | ||||
|  | ||||
| LLMs sind ziemlich gut darin, kleine Codeproben zu erzeugen. Diese API macht sich das zunutze, indem sie das  | ||||
| LLM auffordert, ein kleines Codebeispiel zu liefern, das eine Aufgabe mit einer Reihe von Werkzeugen ausführt. Diese Aufforderung wird dann ergänzt durch die  | ||||
| Aufgabe, die Sie Ihrem Agenten geben, und die Beschreibung der Werkzeuge, die Sie ihm geben. Auf diese Weise erhält er Zugriff auf die Dokumentation der  | ||||
| Tools, insbesondere die erwarteten Eingaben und Ausgaben, und kann den entsprechenden Code generieren. | ||||
|  | ||||
| #### Tools | ||||
|  | ||||
| Tools sind sehr einfach: Sie bestehen aus einer einzigen Funktion mit einem Namen und einer Beschreibung. Wir verwenden dann die Beschreibungen dieser Tools,  | ||||
| um den Agenten aufzufordern. Anhand der Eingabeaufforderung zeigen wir dem Agenten, wie er die Tools nutzen kann, um das zu tun, was  | ||||
| in der Abfrage angefordert wurde. | ||||
|  | ||||
| Dies geschieht mit brandneuen Tools und nicht mit Pipelines, denn der Agent schreibt besseren Code mit sehr atomaren Tools.  | ||||
| Pipelines sind stärker refaktorisiert und fassen oft mehrere Aufgaben in einer einzigen zusammen. Tools sind dafür gedacht, sich auf | ||||
| eine einzige, sehr einfache Aufgabe zu konzentrieren. | ||||
|  | ||||
| #### Code-Ausführung?! | ||||
|  | ||||
| Dieser Code wird dann mit unserem kleinen Python-Interpreter auf den mit Ihren Tools übergebenen Eingaben ausgeführt.  | ||||
| Wir hören Sie schon schreien "Ausführung von beliebigem Code!", aber lassen Sie uns erklären, warum das nicht der Fall ist. | ||||
|  | ||||
| Die einzigen Funktionen, die aufgerufen werden können, sind die von Ihnen zur Verfügung gestellten Tools und die `print`-Funktion, so dass bereits stark  | ||||
| eingeschränkt ist, was ausgeführt werden kann. Sie sollten sicher sein, solange sich dies auf die Tools von Hugging Face beschränkt.  | ||||
|  | ||||
| Dann lassen wir keine Attributsuche oder Importe zu (die ohnehin nicht benötigt werden, um  | ||||
| Inputs/Outputs an eine kleine Gruppe von Funktionen weiterzugeben), so dass alle offensichtlichen Angriffe (und Sie müssten das LLM  | ||||
| dazu auffordern, sie auszugeben) kein Problem darstellen sollten. Wenn Sie auf Nummer sicher gehen wollen, können Sie die  | ||||
| run()-Methode mit dem zusätzlichen Argument return_code=True ausführen. In diesem Fall gibt der Agent nur den auszuführenden Code  | ||||
| zurück und Sie können entscheiden, ob Sie ihn ausführen möchten oder nicht. | ||||
|  | ||||
| Die Ausführung bricht bei jeder Zeile ab, in der versucht wird, eine illegale Operation auszuführen, oder wenn ein regulärer Python-Fehler  | ||||
| im vom Agenten generierten Code auftritt. | ||||
|  | ||||
| ### Ein kuratierter Satz von Tools | ||||
|  | ||||
| Wir haben eine Reihe von Tools identifiziert, die solche Agenten unterstützen können. Hier ist eine aktualisierte Liste der Tools, die wir  | ||||
| in `transformers` integriert haben: | ||||
|  | ||||
| - **Beantwortung von Fragen zu Dokumenten**: Beantworten Sie anhand eines Dokuments (z.B. PDF) im Bildformat eine Frage zu diesem Dokument ([Donut](./model_doc/donut)) | ||||
| - **Beantworten von Textfragen**: Geben Sie einen langen Text und eine Frage an, beantworten Sie die Frage im Text ([Flan-T5](./model_doc/flan-t5)) | ||||
| - **Unbedingte Bildunterschriften**: Beschriften Sie das Bild! ([BLIP](./model_doc/blip)) | ||||
| - **Bildfragebeantwortung**: Beantworten Sie bei einem Bild eine Frage zu diesem Bild ([VILT](./model_doc/vilt)) | ||||
| - **Bildsegmentierung**: Geben Sie ein Bild und einen Prompt an und geben Sie die Segmentierungsmaske dieses Prompts aus ([CLIPSeg](./model_doc/clipseg)) | ||||
| - **Sprache in Text**: Geben Sie eine Audioaufnahme einer sprechenden Person an und transkribieren Sie die Sprache in Text ([Whisper](./model_doc/whisper)) | ||||
| - **Text in Sprache**: wandelt Text in Sprache um ([SpeechT5](./model_doc/speecht5)) | ||||
| - **Zero-Shot-Textklassifizierung**: Ermitteln Sie anhand eines Textes und einer Liste von Bezeichnungen, welcher Bezeichnung der Text am ehesten entspricht ([BART](./model_doc/bart)) | ||||
| - **Textzusammenfassung**: fassen Sie einen langen Text in einem oder wenigen Sätzen zusammen ([BART](./model_doc/bart)) | ||||
| - **Übersetzung**: Übersetzen des Textes in eine bestimmte Sprache ([NLLB](./model_doc/nllb)) | ||||
|  | ||||
| Diese Tools sind in `transformers` integriert und können auch manuell verwendet werden, zum Beispiel: | ||||
|  | ||||
| ```py | ||||
| from transformers import load_tool | ||||
|  | ||||
| tool = load_tool("text-to-speech") | ||||
| audio = tool("This is a text to speech tool") | ||||
| ``` | ||||
|  | ||||
| ### Benutzerdefinierte Tools | ||||
|  | ||||
| Wir haben zwar eine Reihe von Tools identifiziert, sind aber der festen Überzeugung, dass der Hauptwert dieser Implementierung in  | ||||
| der Möglichkeit besteht, benutzerdefinierte Tools schnell zu erstellen und weiterzugeben. | ||||
|  | ||||
| Indem Sie den Code eines Tools in einen Hugging Face Space oder ein Modell-Repository stellen, können Sie das Tool  | ||||
| direkt mit dem Agenten nutzen. Wir haben ein paar neue  | ||||
| **transformers-agnostische** Tools zur [`huggingface-tools` Organisation](https://huggingface.co/huggingface-tools) hinzugefügt: | ||||
|  | ||||
| - **Text-Downloader**: zum Herunterladen eines Textes von einer Web-URL | ||||
| - **Text zu Bild**: erzeugt ein Bild nach einer Eingabeaufforderung und nutzt dabei Stable Diffusion | ||||
| - **Bildtransformation**: verändert ein Bild anhand eines Ausgangsbildes und einer Eingabeaufforderung, unter Verwendung von InstructPix2Pix (Stable Diffusion) | ||||
| - **Text zu Video**: Erzeugen eines kleinen Videos nach einer Eingabeaufforderung, unter Verwendung von damo-vilab | ||||
|  | ||||
| Das Text-zu-Bild-Tool, das wir von Anfang an verwendet haben, ist ein Remote-Tool, das sich in  | ||||
| [*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image) befindet! Wir werden | ||||
| weiterhin solche Tools für diese und andere Organisationen veröffentlichen, um diese Implementierung weiter zu verbessern. | ||||
|  | ||||
| Die Agenten haben standardmäßig Zugriff auf die Tools, die sich auf [*huggingface-tools*](https://huggingface.co/huggingface-tools) befinden. | ||||
| Wie Sie Ihre eigenen Tools schreiben und freigeben können und wie Sie jedes benutzerdefinierte Tool, das sich auf dem Hub befindet, nutzen können, erklären wir in [folgender Anleitung](custom_tools). | ||||
|  | ||||
| ### Code-Erzeugung | ||||
|  | ||||
| Bisher haben wir gezeigt, wie Sie die Agenten nutzen können, um Aktionen für Sie durchzuführen. Der Agent generiert jedoch nur Code, | ||||
| den wir dann mit einem sehr eingeschränkten Python-Interpreter ausführen. Falls Sie den generierten Code in  | ||||
| einer anderen Umgebung verwenden möchten, können Sie den Agenten auffordern, den Code zusammen mit einer Tooldefinition und genauen Importen zurückzugeben. | ||||
|  | ||||
| Zum Beispiel die folgende Anweisung | ||||
| ```python | ||||
| agent.run("Draw me a picture of rivers and lakes", return_code=True) | ||||
| ``` | ||||
|  | ||||
| gibt den folgenden Code zurück | ||||
|  | ||||
| ```python | ||||
| from transformers import load_tool | ||||
|  | ||||
| image_generator = load_tool("huggingface-tools/text-to-image") | ||||
|  | ||||
| image = image_generator(prompt="rivers and lakes") | ||||
| ``` | ||||
|  | ||||
| den Sie dann selbst ändern und ausführen können. | ||||
| @ -1,232 +1,42 @@ | ||||
| - sections: | ||||
|   - local: index | ||||
|     title: Transformers | ||||
|     title: 🤗 Transformers | ||||
|   - local: quicktour | ||||
|     title: Quick tour | ||||
|   - local: installation | ||||
|     title: Installation | ||||
|   - local: quicktour | ||||
|     title: Quickstart | ||||
|   title: Get started | ||||
| - isExpanded: false | ||||
|   sections: | ||||
|   - sections: | ||||
|     - local: models | ||||
|       title: Loading models | ||||
|     - local: custom_models | ||||
|       title: Customizing models | ||||
|     - local: how_to_hack_models | ||||
|       title: Customizing model components | ||||
|     - local: model_sharing | ||||
|       title: Sharing | ||||
|   - local: add_new_model | ||||
|       title: Adding a new model to Transformers | ||||
|     - local: modular_transformers | ||||
|       title: Modular Transformers | ||||
|     - local: auto_docstring | ||||
|       title: Document your models | ||||
|     - local: task_summary | ||||
|       title: What 🤗 Transformers can do | ||||
|     - local: tasks_explained | ||||
|       title: How 🤗 Transformers solve tasks | ||||
|     - local: model_summary | ||||
|       title: The Transformer model family | ||||
|     - local: attention | ||||
|       title: Attention mechanisms | ||||
|     - local: attention_interface | ||||
|       title: Customizing attention function | ||||
|     title: Models | ||||
|   - sections: | ||||
|     - local: fast_tokenizers | ||||
|       title: Tokenizers | ||||
|     - local: image_processors | ||||
|       title: Image processors | ||||
|     - local: video_processors | ||||
|       title: Video processors | ||||
|     - local: backbones | ||||
|       title: Backbones | ||||
|     - local: feature_extractors | ||||
|       title: Feature extractors | ||||
|     - local: processors | ||||
|       title: Processors | ||||
|     - local: tokenizer_summary | ||||
|       title: Summary of the tokenizers | ||||
|     - local: pad_truncation | ||||
|       title: Padding and truncation | ||||
|     title: Preprocessors | ||||
|   title: Base classes | ||||
| - isExpanded: false | ||||
|   sections: | ||||
|     title: Adding a new model to `transformers` | ||||
|   title: Get started | ||||
| - sections: | ||||
|   - local: pipeline_tutorial | ||||
|       title: Pipeline | ||||
|     - local: pipeline_gradio | ||||
|       title: Machine learning apps | ||||
|     - local: pipeline_webserver | ||||
|       title: Web server inference | ||||
|     - local: add_new_pipeline | ||||
|       title: Adding a new pipeline | ||||
|     title: Pipeline API | ||||
|   - sections: | ||||
|     - local: llm_tutorial | ||||
|       title: Text generation | ||||
|     - local: generation_strategies | ||||
|       title: Generation strategies | ||||
|     - local: generation_features | ||||
|       title: Generation features | ||||
|     - local: tasks/prompting | ||||
|       title: Prompt engineering | ||||
|     - local: llm_optims | ||||
|       title: Optimizing inference | ||||
|     - local: cache_explanation | ||||
|       title: Caching | ||||
|     - local: kv_cache | ||||
|       title: KV cache strategies | ||||
|     - local: serving | ||||
|       title: Serving | ||||
|     - local: llm_tutorial_optimization | ||||
|       title: Getting the most out of LLMs | ||||
|     - local: perplexity | ||||
|       title: Perplexity of fixed-length models | ||||
|     title: LLMs | ||||
|   - sections: | ||||
|     - local: conversations | ||||
|       title: Chat basics | ||||
|     - local: chat_templating | ||||
|       title: Templates | ||||
|     - local: chat_templating_multimodal | ||||
|       title: Multimodal templates | ||||
|     - local: chat_templating_writing | ||||
|       title: Template writing | ||||
|     - local: chat_extras | ||||
|       title: Tools and RAG | ||||
|     title: Chat with models | ||||
|   - sections: | ||||
|     - local: perf_torch_compile | ||||
|       title: torch.compile | ||||
|     - local: perf_infer_gpu_one | ||||
|       title: GPU | ||||
|     - local: perf_infer_gpu_multi | ||||
|       title: Distributed GPU inference | ||||
|     - local: perf_infer_cpu | ||||
|       title: CPU | ||||
|     - local: tf_xla | ||||
|       title: XLA | ||||
|     title: Optimization | ||||
|   - local: agents | ||||
|     title: Agents | ||||
|   - local: tools | ||||
|     title: Tools | ||||
|   title: Inference | ||||
| - isExpanded: false | ||||
|   sections: | ||||
|   - sections: | ||||
|     - local: trainer | ||||
|       title: Trainer | ||||
|     title: Run inference with pipelines | ||||
|   - local: autoclass_tutorial | ||||
|     title: Write portable code with AutoClass | ||||
|   - local: preprocessing | ||||
|     title: Preprocess data | ||||
|   - local: training | ||||
|       title: Fine-tuning | ||||
|     - local: optimizers | ||||
|       title: Optimizers | ||||
|     - local: hpo_train | ||||
|       title: Hyperparameter search | ||||
|     title: Trainer API | ||||
|   - sections: | ||||
|     - local: accelerator_selection | ||||
|       title: Accelerator selection | ||||
|     title: Fine-tune a pretrained model | ||||
|   - local: run_scripts | ||||
|     title: Train with a script | ||||
|   - local: accelerate | ||||
|       title: Accelerate | ||||
|     - local: fsdp | ||||
|       title: FullyShardedDataParallel | ||||
|     - local: deepspeed | ||||
|       title: DeepSpeed | ||||
|     - local: debugging | ||||
|       title: Multi-GPU debugging | ||||
|     - local: perf_train_cpu_many | ||||
|       title: Distributed CPUs | ||||
|     - local: perf_train_gpu_many | ||||
|       title: Parallelism methods | ||||
|     title: Distributed training | ||||
|   - sections: | ||||
|     - local: perf_train_gpu_one | ||||
|       title: GPU | ||||
|     - local: perf_train_cpu | ||||
|       title: CPU | ||||
|     - local: perf_train_tpu_tf | ||||
|       title: TPU | ||||
|     - local: perf_train_special | ||||
|       title: Apple Silicon | ||||
|     - local: perf_train_gaudi | ||||
|       title: Intel Gaudi | ||||
|     - local: perf_hardware | ||||
|       title: Build your own machine | ||||
|     title: Hardware | ||||
|     title: Set up distributed training with 🤗 Accelerate | ||||
|   - local: peft | ||||
|     title: PEFT | ||||
|   - local: model_memory_anatomy | ||||
|     title: Model training anatomy | ||||
|   title: Training | ||||
| - isExpanded: false | ||||
|   sections: | ||||
|   - local: quantization/overview | ||||
|     title: Overview | ||||
|   - local: quantization/selecting | ||||
|     title: Selecting a quantization method | ||||
|   - local: quantization/concept_guide | ||||
|     title: Quantization concepts | ||||
|   - local: quantization/aqlm | ||||
|     title: AQLM | ||||
|   - local: quantization/auto_round | ||||
|     title: AutoRound | ||||
|   - local: quantization/awq | ||||
|     title: AWQ | ||||
|   - local: quantization/bitnet | ||||
|     title: BitNet | ||||
|   - local: quantization/bitsandbytes | ||||
|     title: bitsandbytes | ||||
|   - local: quantization/compressed_tensors | ||||
|     title: compressed-tensors | ||||
|   - local: quantization/eetq | ||||
|     title: EETQ | ||||
|   - local: quantization/fbgemm_fp8 | ||||
|     title: FBGEMM | ||||
|   - local: quantization/finegrained_fp8 | ||||
|     title: Fine-grained FP8 | ||||
|   - local: gguf | ||||
|     title: GGUF | ||||
|   - local: quantization/gptq | ||||
|     title: GPTQ | ||||
|   - local: quantization/higgs | ||||
|     title: HIGGS | ||||
|   - local: quantization/hqq | ||||
|     title: HQQ | ||||
|   - local: quantization/optimum | ||||
|     title: Optimum | ||||
|   - local: quantization/quanto | ||||
|     title: Quanto | ||||
|   - local: quantization/quark | ||||
|     title: Quark | ||||
|   - local: quantization/torchao | ||||
|     title: torchao | ||||
|   - local: quantization/spqr | ||||
|     title: SpQR | ||||
|   - local: quantization/vptq | ||||
|     title: VPTQ | ||||
|   - local: quantization/contribute | ||||
|     title: Contribute | ||||
|   title: Quantization | ||||
| - isExpanded: false | ||||
|   sections: | ||||
|   - local: serialization | ||||
|     title: ONNX | ||||
|   - local: tflite | ||||
|     title: LiteRT | ||||
|   - local: executorch | ||||
|     title: ExecuTorch | ||||
|   - local: torchscript | ||||
|     title: TorchScript | ||||
|   title: Export to production | ||||
| - isExpanded: false | ||||
|   sections: | ||||
|   - sections: | ||||
|     title: Load and train adapters with 🤗 PEFT | ||||
|   - local: model_sharing | ||||
|     title: Share your model | ||||
|   - local: agents | ||||
|     title: Agents 101 | ||||
|   - local: agents_advanced | ||||
|     title: Agents, supercharged - Multi-agents, External tools, and more | ||||
|   - local: llm_tutorial | ||||
|     title: Generation with LLMs | ||||
|   - local: conversations | ||||
|     title: Chatting with Transformers | ||||
|   title: Tutorials | ||||
| - sections: | ||||
|   - isExpanded: false | ||||
|     sections: | ||||
|     - local: tasks/sequence_classification | ||||
|       title: Text classification | ||||
|     - local: tasks/token_classification | ||||
| @ -243,14 +53,16 @@ | ||||
|       title: Summarization | ||||
|     - local: tasks/multiple_choice | ||||
|       title: Multiple choice | ||||
|       title: Natural language processing | ||||
|     - sections: | ||||
|     title: Natural Language Processing | ||||
|   - isExpanded: false | ||||
|     sections: | ||||
|     - local: tasks/audio_classification | ||||
|       title: Audio classification | ||||
|     - local: tasks/asr | ||||
|       title: Automatic speech recognition | ||||
|     title: Audio | ||||
|     - sections: | ||||
|   - isExpanded: false | ||||
|     sections: | ||||
|     - local: tasks/image_classification | ||||
|       title: Image classification | ||||
|     - local: tasks/semantic_segmentation | ||||
| @ -272,11 +84,12 @@ | ||||
|     - local: tasks/mask_generation | ||||
|       title: Mask Generation | ||||
|     - local: tasks/keypoint_detection | ||||
|         title: Keypoint detection | ||||
|       title: Keypoint Detection | ||||
|     - local: tasks/knowledge_distillation_for_image_classification | ||||
|       title: Knowledge Distillation for Computer Vision | ||||
|       title: Computer vision | ||||
|     - sections: | ||||
|     title: Computer Vision | ||||
|   - isExpanded: false | ||||
|     sections: | ||||
|     - local: tasks/image_captioning | ||||
|       title: Image captioning | ||||
|     - local: tasks/document_question_answering | ||||
| @ -285,41 +98,197 @@ | ||||
|       title: Visual Question Answering | ||||
|     - local: tasks/text-to-speech | ||||
|       title: Text to speech | ||||
|       - local: tasks/idefics | ||||
|         title: Image tasks with IDEFICS | ||||
|     - local: tasks/image_text_to_text | ||||
|       title: Image-text-to-text | ||||
|     - local: tasks/video_text_to_text | ||||
|       title: Video-text-to-text | ||||
|       - local: tasks/visual_document_retrieval | ||||
|         title: Visual Document Retrieval | ||||
|     title: Multimodal | ||||
|     title: Task recipes | ||||
|   - local: run_scripts | ||||
|     title: Training scripts | ||||
|   - local: glossary | ||||
|     title: Glossary | ||||
|   - local: philosophy | ||||
|     title: Philosophy | ||||
|   - isExpanded: false | ||||
|     sections: | ||||
|     - local: generation_strategies | ||||
|       title: Customize the generation strategy | ||||
|     - local: kv_cache | ||||
|       title: Best Practices for Generation with Cache | ||||
|     title: Generation | ||||
|   - isExpanded: false | ||||
|     sections: | ||||
|     - local: chat_template_basics | ||||
|       title: Getting Started with Chat Templates for Text LLMs | ||||
|     - local: chat_template_multimodal | ||||
|       title: Multimodal Chat Templates for Vision and Audio LLMs | ||||
|     - local: chat_template_tools_and_documents | ||||
|       title: Expanding Chat Templates with Tools and Documents | ||||
|     - local: chat_template_advanced | ||||
|       title: Advanced Usage and Customizing Your Chat Templates | ||||
|     title: Chat Templates | ||||
|   - isExpanded: false | ||||
|     sections: | ||||
|     - local: tasks/idefics | ||||
|       title: Image tasks with IDEFICS | ||||
|     - local: tasks/prompting | ||||
|       title: LLM prompting guide | ||||
|     title: Prompting | ||||
|   title: Task Guides | ||||
| - sections: | ||||
|   - local: fast_tokenizers | ||||
|     title: Use fast tokenizers from 🤗 Tokenizers | ||||
|   - local: multilingual | ||||
|     title: Run inference with multilingual models | ||||
|   - local: create_a_model | ||||
|     title: Use model-specific APIs | ||||
|   - local: custom_models | ||||
|     title: Share a custom model | ||||
|   - local: trainer | ||||
|     title: Trainer | ||||
|   - local: sagemaker | ||||
|     title: Run training on Amazon SageMaker | ||||
|   - local: serialization | ||||
|     title: Export to ONNX | ||||
|   - local: tflite | ||||
|     title: Export to TFLite | ||||
|   - local: torchscript | ||||
|     title: Export to TorchScript | ||||
|   - local: notebooks | ||||
|     title: Notebooks with examples | ||||
|   - local: community | ||||
|     title: Community resources | ||||
|   - local: troubleshooting | ||||
|     title: Troubleshoot | ||||
|   title: Resources | ||||
| - isExpanded: false | ||||
|   sections: | ||||
|   - local: contributing | ||||
|     title: Contribute to Transformers | ||||
|   - local: testing | ||||
|     title: Transformers model tests | ||||
|   - local: pr_checks | ||||
|     title: Pull request checks | ||||
|   title: Contribute | ||||
| - isExpanded: false | ||||
|   sections: | ||||
|   - local: gguf | ||||
|     title: Interoperability with GGUF files | ||||
|   - local: tiktoken | ||||
|     title: Interoperability with TikToken files | ||||
|   - local: modular_transformers | ||||
|     title: Modularity in `transformers` | ||||
|   - local: how_to_hack_models | ||||
|     title: Model Hacking (overwriting a class to your usage) | ||||
|   title: Developer guides | ||||
| - sections: | ||||
|   - local: quantization/overview | ||||
|     title: Getting started | ||||
|   - local: quantization/bitsandbytes | ||||
|     title: bitsandbytes | ||||
|   - local: quantization/gptq | ||||
|     title: GPTQ | ||||
|   - local: quantization/awq | ||||
|     title: AWQ | ||||
|   - local: quantization/aqlm | ||||
|     title: AQLM | ||||
|   - local: quantization/vptq | ||||
|     title: VPTQ | ||||
|   - local: quantization/spqr | ||||
|     title: SpQR | ||||
|   - local: quantization/quanto | ||||
|     title: Quanto | ||||
|   - local: quantization/eetq | ||||
|     title: EETQ | ||||
|   - local: quantization/higgs | ||||
|     title: HIGGS | ||||
|   - local: quantization/hqq | ||||
|     title: HQQ | ||||
|   - local: quantization/fbgemm_fp8 | ||||
|     title: FBGEMM_FP8 | ||||
|   - local: quantization/optimum | ||||
|     title: Optimum | ||||
|   - local: quantization/torchao | ||||
|     title: TorchAO | ||||
|   - local: quantization/bitnet | ||||
|     title: BitNet | ||||
|   - local: quantization/compressed_tensors | ||||
|     title: compressed-tensors | ||||
|   - local: quantization/finegrained_fp8 | ||||
|     title: Fine-grained FP8 | ||||
|   - local: quantization/contribute | ||||
|     title: Contribute new quantization method | ||||
|   title: Quantization Methods | ||||
| - sections: | ||||
|   - local: performance | ||||
|     title: Overview | ||||
|   - local: llm_optims | ||||
|     title: LLM inference optimization | ||||
|   - sections: | ||||
|     - local: perf_train_gpu_one | ||||
|       title: Methods and tools for efficient training on a single GPU | ||||
|     - local: perf_train_gpu_many | ||||
|       title: Multiple GPUs and parallelism | ||||
|     - local: fsdp | ||||
|       title: Fully Sharded Data Parallel | ||||
|     - local: deepspeed | ||||
|       title: DeepSpeed | ||||
|     - local: perf_train_cpu | ||||
|       title: Efficient training on CPU | ||||
|     - local: perf_train_cpu_many | ||||
|       title: Distributed CPU training | ||||
|     - local: perf_train_tpu_tf | ||||
|       title: Training on TPU with TensorFlow | ||||
|     - local: perf_train_special | ||||
|       title: PyTorch training on Apple silicon | ||||
|     - local: perf_hardware | ||||
|       title: Custom hardware for training | ||||
|     - local: hpo_train | ||||
|       title: Hyperparameter Search using Trainer API | ||||
|     title: Efficient training techniques | ||||
|   - sections: | ||||
|     - local: perf_infer_cpu | ||||
|       title: CPU inference | ||||
|     - local: perf_infer_gpu_one | ||||
|       title: GPU inference | ||||
|     - local: perf_infer_gpu_multi | ||||
|       title: Multi-GPU inference | ||||
|     title: Optimizing inference | ||||
|   - local: big_models | ||||
|     title: Instantiate a big model | ||||
|   - local: debugging | ||||
|     title: Debugging | ||||
|   - local: tf_xla | ||||
|     title: XLA Integration for TensorFlow Models | ||||
|   - local: perf_torch_compile | ||||
|     title: Optimize inference using `torch.compile()` | ||||
|   title: Performance and scalability | ||||
| - sections: | ||||
|   - local: contributing | ||||
|     title: How to contribute to 🤗 Transformers? | ||||
|   - local: add_new_model | ||||
|     title: How to add a model to 🤗 Transformers? | ||||
|   - local: add_new_pipeline | ||||
|     title: How to add a pipeline to 🤗 Transformers? | ||||
|   - local: testing | ||||
|     title: Testing | ||||
|   - local: pr_checks | ||||
|     title: Checks on a Pull Request | ||||
|   title: Contribute | ||||
| - sections: | ||||
|   - local: philosophy | ||||
|     title: Philosophy | ||||
|   - local: glossary | ||||
|     title: Glossary | ||||
|   - local: task_summary | ||||
|     title: What 🤗 Transformers can do | ||||
|   - local: tasks_explained | ||||
|     title: How 🤗 Transformers solve tasks | ||||
|   - local: model_summary | ||||
|     title: The Transformer model family | ||||
|   - local: tokenizer_summary | ||||
|     title: Summary of the tokenizers | ||||
|   - local: attention | ||||
|     title: Attention mechanisms | ||||
|   - local: pad_truncation | ||||
|     title: Padding and truncation | ||||
|   - local: bertology | ||||
|     title: BERTology | ||||
|   - local: perplexity | ||||
|     title: Perplexity of fixed-length models | ||||
|   - local: pipeline_webserver | ||||
|     title: Pipelines for webserver inference | ||||
|   - local: model_memory_anatomy | ||||
|     title: Model training anatomy | ||||
|   - local: llm_tutorial_optimization | ||||
|     title: Getting the most out of LLMs | ||||
|   title: Conceptual guides | ||||
| - sections: | ||||
|   - sections: | ||||
|     - local: main_classes/agent | ||||
|       title: Agents and Tools | ||||
|     - local: model_doc/auto | ||||
|       title: Auto Classes | ||||
|     - local: main_classes/backbones | ||||
| @ -344,8 +313,6 @@ | ||||
|       title: Optimization | ||||
|     - local: main_classes/output | ||||
|       title: Model outputs | ||||
|     - local: main_classes/peft | ||||
|       title: PEFT | ||||
|     - local: main_classes/pipelines | ||||
|       title: Pipelines | ||||
|     - local: main_classes/processors | ||||
| @ -364,11 +331,10 @@ | ||||
|       title: Feature Extractor | ||||
|     - local: main_classes/image_processor | ||||
|       title: Image Processor | ||||
|     - local: main_classes/video_processor | ||||
|       title: Video Processor | ||||
|     title: Main Classes | ||||
|   - sections: | ||||
|     - sections: | ||||
|     - isExpanded: false | ||||
|       sections: | ||||
|       - local: model_doc/albert | ||||
|         title: ALBERT | ||||
|       - local: model_doc/bamba | ||||
| @ -386,15 +352,13 @@ | ||||
|       - local: model_doc/bert-japanese | ||||
|         title: BertJapanese | ||||
|       - local: model_doc/bertweet | ||||
|         title: BERTweet | ||||
|         title: Bertweet | ||||
|       - local: model_doc/big_bird | ||||
|         title: BigBird | ||||
|       - local: model_doc/bigbird_pegasus | ||||
|         title: BigBirdPegasus | ||||
|       - local: model_doc/biogpt | ||||
|         title: BioGpt | ||||
|       - local: model_doc/bitnet | ||||
|         title: BitNet | ||||
|       - local: model_doc/blenderbot | ||||
|         title: Blenderbot | ||||
|       - local: model_doc/blenderbot-small | ||||
| @ -431,8 +395,6 @@ | ||||
|         title: DeBERTa | ||||
|       - local: model_doc/deberta-v2 | ||||
|         title: DeBERTa-v2 | ||||
|       - local: model_doc/deepseek_v3 | ||||
|         title: DeepSeek-V3 | ||||
|       - local: model_doc/dialogpt | ||||
|         title: DialoGPT | ||||
|       - local: model_doc/diffllama | ||||
| @ -455,8 +417,6 @@ | ||||
|         title: Falcon | ||||
|       - local: model_doc/falcon3 | ||||
|         title: Falcon3 | ||||
|       - local: model_doc/falcon_h1 | ||||
|         title: FalconH1 | ||||
|       - local: model_doc/falcon_mamba | ||||
|         title: FalconMamba | ||||
|       - local: model_doc/flan-t5 | ||||
| @ -479,8 +439,6 @@ | ||||
|         title: Gemma2 | ||||
|       - local: model_doc/glm | ||||
|         title: GLM | ||||
|       - local: model_doc/glm4 | ||||
|         title: glm4 | ||||
|       - local: model_doc/openai-gpt | ||||
|         title: GPT | ||||
|       - local: model_doc/gpt_neo | ||||
| @ -503,16 +461,14 @@ | ||||
|         title: Granite | ||||
|       - local: model_doc/granitemoe | ||||
|         title: GraniteMoe | ||||
|       - local: model_doc/granitemoehybrid | ||||
|         title: GraniteMoeHybrid | ||||
|       - local: model_doc/granitemoeshared | ||||
|         title: GraniteMoeShared | ||||
|       - local: model_doc/granitevision | ||||
|         title: GraniteVision | ||||
|       - local: model_doc/helium | ||||
|         title: Helium | ||||
|       - local: model_doc/herbert | ||||
|         title: HerBERT | ||||
|       - local: model_doc/hgnet_v2 | ||||
|         title: HGNet-V2 | ||||
|       - local: model_doc/ibert | ||||
|         title: I-BERT | ||||
|       - local: model_doc/jamba | ||||
| @ -542,7 +498,7 @@ | ||||
|       - local: model_doc/mamba | ||||
|         title: Mamba | ||||
|       - local: model_doc/mamba2 | ||||
|         title: Mamba2 | ||||
|         title: mamba2 | ||||
|       - local: model_doc/marian | ||||
|         title: MarianMT | ||||
|       - local: model_doc/markuplm | ||||
| @ -555,8 +511,6 @@ | ||||
|         title: MegatronBERT | ||||
|       - local: model_doc/megatron_gpt2 | ||||
|         title: MegatronGPT2 | ||||
|       - local: model_doc/minimax | ||||
|         title: MiniMax | ||||
|       - local: model_doc/mistral | ||||
|         title: Mistral | ||||
|       - local: model_doc/mixtral | ||||
| @ -623,10 +577,6 @@ | ||||
|         title: Qwen2 | ||||
|       - local: model_doc/qwen2_moe | ||||
|         title: Qwen2MoE | ||||
|       - local: model_doc/qwen3 | ||||
|         title: Qwen3 | ||||
|       - local: model_doc/qwen3_moe | ||||
|         title: Qwen3MoE | ||||
|       - local: model_doc/rag | ||||
|         title: RAG | ||||
|       - local: model_doc/realm | ||||
| @ -694,7 +644,8 @@ | ||||
|       - local: model_doc/zamba2 | ||||
|         title: Zamba2 | ||||
|       title: Text models | ||||
|     - sections: | ||||
|     - isExpanded: false | ||||
|       sections: | ||||
|       - local: model_doc/beit | ||||
|         title: BEiT | ||||
|       - local: model_doc/bit | ||||
| @ -707,8 +658,6 @@ | ||||
|         title: ConvNeXTV2 | ||||
|       - local: model_doc/cvt | ||||
|         title: CvT | ||||
|       - local: model_doc/d_fine | ||||
|         title: D-FINE | ||||
|       - local: model_doc/dab-detr | ||||
|         title: DAB-DETR | ||||
|       - local: model_doc/deformable_detr | ||||
| @ -755,8 +704,6 @@ | ||||
|         title: Mask2Former | ||||
|       - local: model_doc/maskformer | ||||
|         title: MaskFormer | ||||
|       - local: model_doc/mlcd | ||||
|         title: MLCD | ||||
|       - local: model_doc/mobilenet_v1 | ||||
|         title: MobileNetV1 | ||||
|       - local: model_doc/mobilenet_v2 | ||||
| @ -769,8 +716,6 @@ | ||||
|         title: NAT | ||||
|       - local: model_doc/poolformer | ||||
|         title: PoolFormer | ||||
|       - local: model_doc/prompt_depth_anything | ||||
|         title: Prompt Depth Anything | ||||
|       - local: model_doc/pvt | ||||
|         title: Pyramid Vision Transformer (PVT) | ||||
|       - local: model_doc/pvt_v2 | ||||
| @ -828,23 +773,20 @@ | ||||
|       - local: model_doc/zoedepth | ||||
|         title: ZoeDepth | ||||
|       title: Vision models | ||||
|     - sections: | ||||
|     - isExpanded: false | ||||
|       sections: | ||||
|       - local: model_doc/audio-spectrogram-transformer | ||||
|         title: Audio Spectrogram Transformer | ||||
|       - local: model_doc/bark | ||||
|         title: Bark | ||||
|       - local: model_doc/clap | ||||
|         title: CLAP | ||||
|       - local: model_doc/csm | ||||
|         title: CSM | ||||
|       - local: model_doc/dac | ||||
|         title: dac | ||||
|       - local: model_doc/encodec | ||||
|         title: EnCodec | ||||
|       - local: model_doc/fastspeech2_conformer | ||||
|         title: FastSpeech2Conformer | ||||
|       - local: model_doc/granite_speech | ||||
|         title: GraniteSpeech | ||||
|       - local: model_doc/hubert | ||||
|         title: Hubert | ||||
|       - local: model_doc/mctct | ||||
| @ -902,7 +844,8 @@ | ||||
|       - local: model_doc/xlsr_wav2vec2 | ||||
|         title: XLSR-Wav2Vec2 | ||||
|       title: Audio models | ||||
|     - sections: | ||||
|     - isExpanded: false | ||||
|       sections: | ||||
|       - local: model_doc/timesformer | ||||
|         title: TimeSformer | ||||
|       - local: model_doc/videomae | ||||
| @ -910,15 +853,14 @@ | ||||
|       - local: model_doc/vivit | ||||
|         title: ViViT | ||||
|       title: Video models | ||||
|     - sections: | ||||
|     - isExpanded: false | ||||
|       sections: | ||||
|       - local: model_doc/align | ||||
|         title: ALIGN | ||||
|       - local: model_doc/altclip | ||||
|         title: AltCLIP | ||||
|       - local: model_doc/aria | ||||
|         title: Aria | ||||
|       - local: model_doc/aya_vision | ||||
|         title: AyaVision | ||||
|       - local: model_doc/blip | ||||
|         title: BLIP | ||||
|       - local: model_doc/blip-2 | ||||
| @ -939,8 +881,6 @@ | ||||
|         title: CLVP | ||||
|       - local: model_doc/colpali | ||||
|         title: ColPali | ||||
|       - local: model_doc/colqwen2 | ||||
|         title: ColQwen2 | ||||
|       - local: model_doc/data2vec | ||||
|         title: Data2Vec | ||||
|       - local: model_doc/deplot | ||||
| @ -951,14 +891,10 @@ | ||||
|         title: Emu3 | ||||
|       - local: model_doc/flava | ||||
|         title: FLAVA | ||||
|       - local: model_doc/gemma3 | ||||
|         title: Gemma3 | ||||
|       - local: model_doc/git | ||||
|         title: GIT | ||||
|       - local: model_doc/got_ocr2 | ||||
|         title: GOT-OCR2 | ||||
|       - local: model_doc/granitevision | ||||
|         title: GraniteVision | ||||
|       - local: model_doc/grounding-dino | ||||
|         title: Grounding DINO | ||||
|       - local: model_doc/groupvit | ||||
| @ -973,10 +909,6 @@ | ||||
|         title: InstructBLIP | ||||
|       - local: model_doc/instructblipvideo | ||||
|         title: InstructBlipVideo | ||||
|       - local: model_doc/internvl | ||||
|         title: InternVL | ||||
|       - local: model_doc/janus | ||||
|         title: Janus | ||||
|       - local: model_doc/kosmos-2 | ||||
|         title: KOSMOS-2 | ||||
|       - local: model_doc/layoutlm | ||||
| @ -989,8 +921,6 @@ | ||||
|         title: LayoutXLM | ||||
|       - local: model_doc/lilt | ||||
|         title: LiLT | ||||
|       - local: model_doc/llama4 | ||||
|         title: Llama4 | ||||
|       - local: model_doc/llava | ||||
|         title: Llava | ||||
|       - local: model_doc/llava_next | ||||
| @ -1005,8 +935,6 @@ | ||||
|         title: MatCha | ||||
|       - local: model_doc/mgp-str | ||||
|         title: MGP-STR | ||||
|       - local: model_doc/mistral3 | ||||
|         title: Mistral3 | ||||
|       - local: model_doc/mllama | ||||
|         title: mllama | ||||
|       - local: model_doc/nougat | ||||
| @ -1023,14 +951,10 @@ | ||||
|         title: PaliGemma | ||||
|       - local: model_doc/perceiver | ||||
|         title: Perceiver | ||||
|       - local: model_doc/phi4_multimodal | ||||
|         title: Phi4 Multimodal | ||||
|       - local: model_doc/pix2struct | ||||
|         title: Pix2Struct | ||||
|       - local: model_doc/pixtral | ||||
|         title: Pixtral | ||||
|       - local: model_doc/qwen2_5_omni | ||||
|         title: Qwen2.5-Omni | ||||
|       - local: model_doc/qwen2_5_vl | ||||
|         title: Qwen2.5-VL | ||||
|       - local: model_doc/qwen2_audio | ||||
| @ -1039,16 +963,8 @@ | ||||
|         title: Qwen2VL | ||||
|       - local: model_doc/sam | ||||
|         title: Segment Anything | ||||
|       - local: model_doc/sam_hq | ||||
|         title: Segment Anything High Quality | ||||
|       - local: model_doc/shieldgemma2 | ||||
|         title: ShieldGemma2 | ||||
|       - local: model_doc/siglip | ||||
|         title: SigLIP | ||||
|       - local: model_doc/siglip2 | ||||
|         title: SigLIP2 | ||||
|       - local: model_doc/smolvlm | ||||
|         title: SmolVLM | ||||
|       - local: model_doc/speech-encoder-decoder | ||||
|         title: Speech Encoder Decoder Models | ||||
|       - local: model_doc/tapas | ||||
| @ -1076,13 +992,15 @@ | ||||
|       - local: model_doc/xclip | ||||
|         title: X-CLIP | ||||
|       title: Multimodal models | ||||
|     - sections: | ||||
|     - isExpanded: false | ||||
|       sections: | ||||
|       - local: model_doc/decision_transformer | ||||
|         title: Decision Transformer | ||||
|       - local: model_doc/trajectory_transformer | ||||
|         title: Trajectory Transformer | ||||
|       title: Reinforcement learning models | ||||
|     - sections: | ||||
|     - isExpanded: false | ||||
|       sections: | ||||
|       - local: model_doc/autoformer | ||||
|         title: Autoformer | ||||
|       - local: model_doc/informer | ||||
| @ -1093,10 +1011,9 @@ | ||||
|         title: PatchTST | ||||
|       - local: model_doc/time_series_transformer | ||||
|         title: Time Series Transformer | ||||
|       - local: model_doc/timesfm | ||||
|         title: TimesFM | ||||
|       title: Time series models | ||||
|     - sections: | ||||
|     - isExpanded: false | ||||
|       sections: | ||||
|       - local: model_doc/graphormer | ||||
|         title: Graphormer | ||||
|       title: Graph models | ||||
| @ -1104,8 +1021,6 @@ | ||||
|   - sections: | ||||
|     - local: internal/modeling_utils | ||||
|       title: Custom Layers and Utilities | ||||
|     - local: internal/model_debugging_utils | ||||
|       title: Utilities for Model Debugging | ||||
|     - local: internal/pipelines_utils | ||||
|       title: Utilities for pipelines | ||||
|     - local: internal/tokenization_utils | ||||
| @ -1120,14 +1035,7 @@ | ||||
|       title: Utilities for Audio processing | ||||
|     - local: internal/file_utils | ||||
|       title: General Utilities | ||||
|     - local: internal/import_utils | ||||
|       title: Importing Utilities | ||||
|     - local: internal/time_series_utils | ||||
|       title: Utilities for Time Series | ||||
|     title: Internal helpers | ||||
|   - sections: | ||||
|     - local: reference/environment_variables | ||||
|       title: Environment Variables | ||||
|     title: Reference | ||||
|     title: Internal Helpers | ||||
|   title: API | ||||
|  | ||||
|  | ||||
| @ -1,4 +1,4 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
| <!--Copyright 2022 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
| @ -14,152 +14,123 @@ rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Accelerate | ||||
| # Distributed training with 🤗 Accelerate | ||||
|  | ||||
| [Accelerate](https://hf.co/docs/accelerate/index) is a library designed to simplify distributed training on any type of setup with PyTorch by uniting the most common frameworks ([Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/)) for it into a single interface. [`Trainer`] is powered by Accelerate under the hood, enabling loading big models and distributed training. | ||||
| As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPUs on one machine or multiple GPUs across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment. | ||||
|  | ||||
| This guide will show you two ways to use Accelerate with Transformers, using FSDP as the backend. The first method demonstrates distributed training with [`Trainer`], and the second method demonstrates adapting a PyTorch training loop. For more detailed information about Accelerate, please refer to the [documentation](https://hf.co/docs/accelerate/index). | ||||
| ## Setup | ||||
|  | ||||
| Get started by installing 🤗 Accelerate: | ||||
|  | ||||
| ```bash | ||||
| pip install accelerate | ||||
| ``` | ||||
|  | ||||
| Start by running [accelerate config](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-config) in the command line to answer a series of prompts about your training system. This creates and saves a configuration file to help Accelerate correctly set up training based on your setup. | ||||
|  | ||||
| ```bash | ||||
| accelerate config | ||||
| ``` | ||||
|  | ||||
| Depending on your setup and the answers you provide, an example configuration file for distributing training with FSDP on one machine with two GPUs may look like the following. | ||||
|  | ||||
| ```yaml | ||||
| compute_environment: LOCAL_MACHINE | ||||
| debug: false | ||||
| distributed_type: FSDP | ||||
| downcast_bf16: 'no' | ||||
| fsdp_config: | ||||
|   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP | ||||
|   fsdp_backward_prefetch_policy: BACKWARD_PRE | ||||
|   fsdp_forward_prefetch: false | ||||
|   fsdp_cpu_ram_efficient_loading: true | ||||
|   fsdp_offload_params: false | ||||
|   fsdp_sharding_strategy: FULL_SHARD | ||||
|   fsdp_state_dict_type: SHARDED_STATE_DICT | ||||
|   fsdp_sync_module_states: true | ||||
|   fsdp_transformer_layer_cls_to_wrap: BertLayer | ||||
|   fsdp_use_orig_params: true | ||||
| machine_rank: 0 | ||||
| main_training_function: main | ||||
| mixed_precision: bf16 | ||||
| num_machines: 1 | ||||
| num_processes: 2 | ||||
| rdzv_backend: static | ||||
| same_network: true | ||||
| tpu_env: [] | ||||
| tpu_use_cluster: false | ||||
| tpu_use_sudo: false | ||||
| use_cpu: false | ||||
| ``` | ||||
|  | ||||
| ## Trainer | ||||
|  | ||||
| Pass the path to the saved configuration file to [`TrainingArguments`], and from there, pass your [`TrainingArguments`] to [`Trainer`]. | ||||
| Then import and create an [`~accelerate.Accelerator`] object. The [`~accelerate.Accelerator`] will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. | ||||
|  | ||||
| ```py | ||||
| from transformers import TrainingArguments, Trainer | ||||
| >>> from accelerate import Accelerator | ||||
|  | ||||
| training_args = TrainingArguments( | ||||
|     output_dir="your-model", | ||||
|     learning_rate=2e-5, | ||||
|     per_device_train_batch_size=16, | ||||
|     per_device_eval_batch_size=16, | ||||
|     num_train_epochs=2, | ||||
|     fsdp_config="path/to/fsdp_config", | ||||
|     fsdp_strategy="full_shard", | ||||
|     weight_decay=0.01, | ||||
|     eval_strategy="epoch", | ||||
|     save_strategy="epoch", | ||||
|     load_best_model_at_end=True, | ||||
|     push_to_hub=True, | ||||
| >>> accelerator = Accelerator() | ||||
| ``` | ||||
|  | ||||
| ## Prepare to accelerate | ||||
|  | ||||
| The next step is to pass all the relevant training objects to the [`~accelerate.Accelerator.prepare`] method. This includes your training and evaluation DataLoaders, a model and an optimizer: | ||||
|  | ||||
| ```py | ||||
| >>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( | ||||
| ...     train_dataloader, eval_dataloader, model, optimizer | ||||
| ... ) | ||||
| ``` | ||||
|  | ||||
| ## Backward | ||||
|  | ||||
| The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`] method: | ||||
|  | ||||
| ```py | ||||
| >>> for epoch in range(num_epochs): | ||||
| ...     for batch in train_dataloader: | ||||
| ...         outputs = model(**batch) | ||||
| ...         loss = outputs.loss | ||||
| ...         accelerator.backward(loss) | ||||
|  | ||||
| ...         optimizer.step() | ||||
| ...         lr_scheduler.step() | ||||
| ...         optimizer.zero_grad() | ||||
| ...         progress_bar.update(1) | ||||
| ``` | ||||
|  | ||||
| As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training! | ||||
|  | ||||
| ```diff | ||||
| + from accelerate import Accelerator | ||||
|   from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler | ||||
|  | ||||
| + accelerator = Accelerator() | ||||
|  | ||||
|   model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) | ||||
|   optimizer = AdamW(model.parameters(), lr=3e-5) | ||||
|  | ||||
| - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") | ||||
| - model.to(device) | ||||
|  | ||||
| + train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( | ||||
| +     train_dataloader, eval_dataloader, model, optimizer | ||||
| + ) | ||||
|  | ||||
|   num_epochs = 3 | ||||
|   num_training_steps = num_epochs * len(train_dataloader) | ||||
|   lr_scheduler = get_scheduler( | ||||
|       "linear", | ||||
|       optimizer=optimizer, | ||||
|       num_warmup_steps=0, | ||||
|       num_training_steps=num_training_steps | ||||
|   ) | ||||
|  | ||||
| trainer = Trainer( | ||||
|     model=model, | ||||
|     args=training_args, | ||||
|     train_dataset=dataset["train"], | ||||
|     eval_dataset=dataset["test"], | ||||
|     processing_class=tokenizer, | ||||
|     data_collator=data_collator, | ||||
|     compute_metrics=compute_metrics, | ||||
| ) | ||||
|   progress_bar = tqdm(range(num_training_steps)) | ||||
|  | ||||
| trainer.train() | ||||
| ``` | ||||
|  | ||||
| ## Native PyTorch | ||||
|  | ||||
| Accelerate can also be added to any PyTorch training loop to enable distributed training. The [`~accelerate.Accelerator`] is the main entry point for adapting your PyTorch code to work with Accelerate. It automatically detects your distributed training setup and initializes all the necessary components for training. You don't need to explicitly place your model on a device because [`~accelerate.Accelerator`] knows which device to move your model to. | ||||
|  | ||||
| ```py | ||||
| from accelerate import Accelerator | ||||
|  | ||||
| accelerator = Accelerator() | ||||
| device = accelerator.device | ||||
| ``` | ||||
|  | ||||
| All PyTorch objects (model, optimizer, scheduler, dataloaders) should be passed to the [`~accelerate.Accelerator.prepare`] method now. This method moves your model to the appropriate device or devices, adapts the optimizer and scheduler to use [`~accelerate.optimizer.AcceleratedOptimizer`] and [`~accelerate.scheduler.AcceleratedScheduler`], and creates a new shardable dataloader. | ||||
|  | ||||
| ```py | ||||
| train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( | ||||
|     train_dataloader, eval_dataloader, model, optimizer | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| Replace `loss.backward` in your training loop with Accelerate's [`~accelerate.Accelerator.backward`] method to scale the gradients and determine the appropriate `backward` method to use depending on your framework (for example, DeepSpeed or Megatron). | ||||
|  | ||||
| ```py | ||||
|   model.train() | ||||
|   for epoch in range(num_epochs): | ||||
|       for batch in train_dataloader: | ||||
| -         batch = {k: v.to(device) for k, v in batch.items()} | ||||
|           outputs = model(**batch) | ||||
|           loss = outputs.loss | ||||
|         accelerator.backward(loss) | ||||
| -         loss.backward() | ||||
| +         accelerator.backward(loss) | ||||
|  | ||||
|           optimizer.step() | ||||
|           lr_scheduler.step() | ||||
|           optimizer.zero_grad() | ||||
|           progress_bar.update(1) | ||||
| ``` | ||||
|  | ||||
| Combine everything into a function and make it callable as a script. | ||||
| ## Train | ||||
|  | ||||
| ```py | ||||
| from accelerate import Accelerator | ||||
| Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory. | ||||
|  | ||||
| def main(): | ||||
|   accelerator = Accelerator() | ||||
| ### Train with a script | ||||
|  | ||||
|   model, optimizer, training_dataloader, scheduler = accelerator.prepare( | ||||
|       model, optimizer, training_dataloader, scheduler | ||||
|   ) | ||||
|  | ||||
|   for batch in training_dataloader: | ||||
|       optimizer.zero_grad() | ||||
|       inputs, targets = batch | ||||
|       outputs = model(inputs) | ||||
|       loss = loss_function(outputs, targets) | ||||
|       accelerator.backward(loss) | ||||
|       optimizer.step() | ||||
|       scheduler.step() | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
| ``` | ||||
|  | ||||
| From the command line, call [accelerate launch](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-launch) to run your training script. Any additional arguments or parameters can be passed here as well. | ||||
|  | ||||
| To launch your training script on two GPUs, add the `--num_processes` argument. | ||||
| If you are running your training from a script, run the following command to create and save a configuration file: | ||||
|  | ||||
| ```bash | ||||
| accelerate launch --num_processes=2 your_script.py | ||||
| accelerate config | ||||
| ``` | ||||
|  | ||||
| Refer to the [Launching Accelerate scripts](https://hf.co/docs/accelerate/main/en/basic_tutorials/launch) for more details. | ||||
| Then launch your training with: | ||||
|  | ||||
| ```bash | ||||
| accelerate launch train.py | ||||
| ``` | ||||
|  | ||||
| ### Train with a notebook | ||||
|  | ||||
| 🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to [`~accelerate.notebook_launcher`]: | ||||
|  | ||||
| ```py | ||||
| >>> from accelerate import notebook_launcher | ||||
|  | ||||
| >>> notebook_launcher(training_function) | ||||
| ``` | ||||
|  | ||||
| For more information about 🤗 Accelerate and its rich features, refer to the [documentation](https://huggingface.co/docs/accelerate). | ||||
|  | ||||
| @ -1,126 +0,0 @@ | ||||
| <!--Copyright 2025 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Accelerator selection | ||||
|  | ||||
| During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed). | ||||
|  | ||||
| This guide will show you how to select the number of accelerators to use and the order to use them in. | ||||
|  | ||||
| ## Number of accelerators | ||||
|  | ||||
| For example, if there are 4 accelerators and you only want to use the first 2, run the command below. | ||||
|  | ||||
| <hfoptions id="select-accelerator"> | ||||
| <hfoption id="torchrun"> | ||||
|  | ||||
| Use `--nproc_per_node` to select how many accelerators to use. | ||||
|  | ||||
| ```bash | ||||
| torchrun --nproc_per_node=2  trainer-program.py ... | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| <hfoption id="Accelerate"> | ||||
|  | ||||
| Use `--num_processes` to select how many accelerators to use. | ||||
|  | ||||
| ```bash | ||||
| accelerate launch --num_processes 2 trainer-program.py ... | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| <hfoption id="DeepSpeed"> | ||||
|  | ||||
| Use `--num_gpus` to select how many GPUs to use. | ||||
|  | ||||
| ```bash | ||||
| deepspeed --num_gpus 2 trainer-program.py ... | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| </hfoptions> | ||||
|  | ||||
| ## Order of accelerators | ||||
| To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file. | ||||
|  | ||||
| For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2: | ||||
|  | ||||
| <hfoptions id="accelerator-type"> | ||||
| <hfoption id="CUDA"> | ||||
|  | ||||
| ```bash | ||||
| CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ... | ||||
| ``` | ||||
|  | ||||
| Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.   | ||||
| To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`): | ||||
|  | ||||
|  | ||||
| ```bash | ||||
| CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ... | ||||
| ``` | ||||
|  | ||||
| To run without any GPUs: | ||||
|  | ||||
| ```bash | ||||
| CUDA_VISIBLE_DEVICES= python trainer-program.py ... | ||||
| ``` | ||||
|  | ||||
| You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`: | ||||
|  | ||||
| - Order by PCIe bus ID (matches `nvidia-smi`): | ||||
|  | ||||
|     ```bash | ||||
|     export CUDA_DEVICE_ORDER=PCI_BUS_ID | ||||
|     ``` | ||||
|  | ||||
| - Order by compute capability (fastest first): | ||||
|  | ||||
|     ```bash | ||||
|     export CUDA_DEVICE_ORDER=FASTEST_FIRST | ||||
|     ``` | ||||
|  | ||||
| </hfoption> | ||||
| <hfoption id="Intel XPU"> | ||||
|  | ||||
| ```bash | ||||
| ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ... | ||||
| ``` | ||||
|  | ||||
| Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.   | ||||
| To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`): | ||||
|  | ||||
| ```bash | ||||
| ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ... | ||||
| ``` | ||||
|  | ||||
|  | ||||
| You can also control the order of Intel XPUs with: | ||||
|  | ||||
| ```bash | ||||
| export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 | ||||
| ``` | ||||
|  | ||||
| For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation. | ||||
|  | ||||
| </hfoption> | ||||
| </hfoptions> | ||||
|  | ||||
|  | ||||
|  | ||||
| > [!WARNING] | ||||
| > Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line. | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -1,4 +1,4 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
| <!--Copyright 2020 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
| @ -13,66 +13,92 @@ rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Adding a new pipeline | ||||
| # How to create a custom pipeline? | ||||
|  | ||||
| Make [`Pipeline`] your own by subclassing it and implementing a few methods. Share the code with the community on the [Hub](https://hf.co) and register the pipeline with Transformers so that everyone can quickly and easily use it. | ||||
| In this guide, we will see how to create a custom pipeline and share it on the [Hub](https://hf.co/models) or add it to the | ||||
| 🤗 Transformers library. | ||||
|  | ||||
| This guide will walk you through the process of adding a new pipeline to Transformers. | ||||
| First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes, | ||||
| dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible | ||||
| as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the | ||||
| pipeline (`preprocess`). | ||||
|  | ||||
| ## Design choices | ||||
| Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of the | ||||
| `postprocess` method. | ||||
|  | ||||
| At a minimum, you only need to provide [`Pipeline`] with an appropriate input for a task. This is also where you should begin when designing your pipeline. | ||||
| Start by inheriting from the base class `Pipeline` and implementing the 4 required methods: `preprocess`, | ||||
| `_forward`, `postprocess`, and `_sanitize_parameters`. | ||||
|  | ||||
| Decide what input types [`Pipeline`] can accept. It can be strings, raw bytes, dictionaries, and so on. Try to keep the inputs in pure Python where possible because it's more compatible. Next, decide on the output [`Pipeline`] should return. Again, keeping the output in Python is the simplest and best option because it's easier to work with. | ||||
|  | ||||
| Keeping the inputs and outputs simple, and ideally JSON-serializable, makes it easier for users to run your [`Pipeline`] without needing to learn new object types. It's also common to support many different input types for even greater ease of use. For example, making an audio file acceptable from a filename, URL, or raw bytes gives the user more flexibility in how they provide the audio data. | ||||
|  | ||||
| ## Create a pipeline | ||||
|  | ||||
| With an input and output decided, you can start implementing [`Pipeline`]. Your pipeline should inherit from the base [`Pipeline`] class and include 4 methods. | ||||
|  | ||||
| ```py | ||||
| ```python | ||||
| from transformers import Pipeline | ||||
|  | ||||
|  | ||||
| class MyPipeline(Pipeline): | ||||
|     def _sanitize_parameters(self, **kwargs): | ||||
|         preprocess_kwargs = {} | ||||
|         if "maybe_arg" in kwargs: | ||||
|             preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] | ||||
|         return preprocess_kwargs, {}, {} | ||||
|  | ||||
|     def preprocess(self, inputs, args=2): | ||||
|  | ||||
|     def _forward(self, model_inputs): | ||||
|  | ||||
|     def postprocess(self, model_outputs): | ||||
| ``` | ||||
|  | ||||
| 1. `preprocess` takes the inputs and transforms them into the appropriate input format for the model. | ||||
|  | ||||
| ```py | ||||
|     def preprocess(self, inputs, maybe_arg=2): | ||||
|         model_input = Tensor(inputs["input_ids"]) | ||||
|         return {"model_input": model_input} | ||||
| ``` | ||||
|  | ||||
| 2. `_forward` shouldn't be called directly. `forward` is the preferred method because it includes safeguards to make sure everything works correctly on the expected device. Anything linked to the model belongs in `_forward` and everything else belongs in either `preprocess` or `postprocess`. | ||||
|  | ||||
| ```py | ||||
|     def _forward(self, model_inputs): | ||||
|         # model_inputs == {"model_input": model_input} | ||||
|         outputs = self.model(**model_inputs) | ||||
|         # Maybe {"logits": Tensor(...)} | ||||
|         return outputs | ||||
| ``` | ||||
|  | ||||
| 3. `postprocess` generates the final output from the model's output in `_forward`. | ||||
|  | ||||
| ```py | ||||
| def postprocess(self, model_outputs, top_k=5): | ||||
|     def postprocess(self, model_outputs): | ||||
|         best_class = model_outputs["logits"].softmax(-1) | ||||
|         return best_class | ||||
| ``` | ||||
|  | ||||
| 4. `_sanitize_parameters` lets users pass additional parameters to [`Pipeline`]. This could be during initialization or when [`Pipeline`] is called. `_sanitize_parameters` returns 3 dicts of additional keyword arguments that are passed directly to `preprocess`, `_forward`, and `postprocess`. Don't add anything if a user didn't call the pipeline with extra parameters. This keeps the default arguments in the function definition which is always more natural. | ||||
| This breakdown is structured to allow relatively seamless support for CPU/GPU, while also allowing | ||||
| pre/postprocessing to run on the CPU on different threads. | ||||
|  | ||||
| `preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might | ||||
| contain more information and is usually a `Dict`. | ||||
|  | ||||
| `_forward` is an implementation detail and is not meant to be called directly. `forward` is the preferred | ||||
| method to call as it contains safeguards to make sure everything is working on the expected device. If anything is | ||||
| linked to a real model, it belongs in the `_forward` method; anything else belongs in `preprocess` or `postprocess`. | ||||
|  | ||||
| `postprocess` methods will take the output of `_forward` and turn it into the final output that was decided | ||||
| earlier. | ||||
|  | ||||
| `_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization | ||||
| time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. | ||||
|  | ||||
| The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`, | ||||
| `_forward`, and `postprocess`. Don't fill in anything if the caller didn't pass any extra parameters. That | ||||
| keeps the default arguments in the function definition, which is always more "natural". | ||||
|  | ||||
| A classic example would be a `top_k` argument in the post processing in classification tasks. | ||||
|  | ||||
| ```python | ||||
| >>> pipe = pipeline("my-new-task") | ||||
| >>> pipe("This is a test") | ||||
| [{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} | ||||
| {"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] | ||||
|  | ||||
| >>> pipe("This is a test", top_k=2) | ||||
| [{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] | ||||
| ``` | ||||
|  | ||||
| In order to achieve that, we'll update our `postprocess` method with a `top_k` parameter that defaults to `5`, and edit | ||||
| `_sanitize_parameters` to allow this new parameter. | ||||
|  | ||||
|  | ||||
| ```python | ||||
| def postprocess(self, model_outputs, top_k=5): | ||||
|     best_class = model_outputs["logits"].softmax(-1) | ||||
|     # Add logic to handle top_k | ||||
|     return best_class | ||||
|  | ||||
| For example, add a `top_k` parameter in `postprocess` to return the top 5 most likely classes. Then in `_sanitize_parameters`, check if the user passed in `top_k` and add it to `postprocess_kwargs`. | ||||
|  | ||||
| ```py | ||||
| def _sanitize_parameters(self, **kwargs): | ||||
|     preprocess_kwargs = {} | ||||
|     if "maybe_arg" in kwargs: | ||||
| @ -84,61 +110,55 @@ def _sanitize_parameters(self, **kwargs): | ||||
|     return preprocess_kwargs, {}, postprocess_kwargs | ||||
| ``` | ||||
|  | ||||
| Now the pipeline can return the top most likely labels if a user chooses to. | ||||
| Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy | ||||
| without requiring users to understand new kinds of objects. It's also relatively common to support many different types | ||||
| of arguments for ease of use (audio files, which can be filenames, URLs or pure bytes) | ||||
|  | ||||
| ```py | ||||
| from transformers import pipeline | ||||
|  | ||||
| pipeline = pipeline("my-task") | ||||
| # returns 3 most likely labels | ||||
| pipeline("This is the best meal I've ever had", top_k=3) | ||||
| # returns 5 most likely labels by default | ||||
| pipeline("This is the best meal I've ever had") | ||||
| ``` | ||||
|  | ||||
| ## Register a pipeline | ||||
| ## Adding it to the list of supported tasks | ||||
|  | ||||
| Register the new task your pipeline supports in the `PIPELINE_REGISTRY`. The registry defines: | ||||
| To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`: | ||||
|  | ||||
| - the machine learning framework the pipeline supports with either `pt_model` or `tf_model` (add both to ensure it works with either framework) | ||||
| - a default model which should come from a specific revision (branch, or commit hash) where the model works as expected with `default` | ||||
| - the expected input with `type` | ||||
|  | ||||
| ```py | ||||
| ```python | ||||
| from transformers.pipelines import PIPELINE_REGISTRY | ||||
| from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification | ||||
|  | ||||
| PIPELINE_REGISTRY.register_pipeline( | ||||
|     "new-task", | ||||
|     pipeline_class=MyPipeline, | ||||
|     pt_model=AutoModelForSequenceClassification, | ||||
|     tf_model=TFAutoModelForSequenceClassification, | ||||
|     default={"pt": ("user/awesome-model", "branch-name")}, | ||||
|     type="text", | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| ## Share your pipeline | ||||
| You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took `"abcdef"`) as well as the type: | ||||
|  | ||||
| Share your pipeline with the community on the [Hub](https://hf.co) or you can add it directly to Transformers. | ||||
| ```python | ||||
| PIPELINE_REGISTRY.register_pipeline( | ||||
|     "new-task", | ||||
|     pipeline_class=MyPipeline, | ||||
|     pt_model=AutoModelForSequenceClassification, | ||||
|     default={"pt": ("user/awesome_model", "abcdef")}, | ||||
|     type="text",  # current support type: text, audio, image, multimodal | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| It's faster to upload your pipeline code to the Hub because it doesn't require a review from the Transformers team. Adding the pipeline to Transformers may be slower because it requires a review and you need to add tests to ensure your [`Pipeline`] works. | ||||
| ## Share your pipeline on the Hub | ||||
|  | ||||
| ### Upload to the Hub | ||||
|  | ||||
| Add your pipeline code to the Hub in a Python file. | ||||
|  | ||||
| For example, a custom pipeline for sentence pair classification might look like the following code below. The implementation works for PyTorch and TensorFlow models. | ||||
| To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a | ||||
| python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this: | ||||
|  | ||||
| ```py | ||||
| import numpy as np | ||||
|  | ||||
| from transformers import Pipeline | ||||
|  | ||||
|  | ||||
| def softmax(outputs): | ||||
|     maxes = np.max(outputs, axis=-1, keepdims=True) | ||||
|     shifted_exp = np.exp(outputs - maxes) | ||||
|     return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) | ||||
|  | ||||
|  | ||||
| class PairClassificationPipeline(Pipeline): | ||||
|     def _sanitize_parameters(self, **kwargs): | ||||
|         preprocess_kwargs = {} | ||||
| @ -163,7 +183,8 @@ class PairClassificationPipeline(Pipeline): | ||||
|         return {"label": label, "score": score, "logits": logits} | ||||
| ``` | ||||
|  | ||||
| Save the code in a file named `pair_classification.py`, and import and register it as shown below. | ||||
| The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in | ||||
| a file named `pair_classification.py`, we can then import it and register it like this. | ||||
|  | ||||
| ```py | ||||
| from pair_classification import PairClassificationPipeline | ||||
| @ -194,36 +215,56 @@ The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5f | ||||
|   }, | ||||
| ``` | ||||
|  | ||||
| Call [`~Pipeline.push_to_hub`] to push the pipeline to the Hub. The Python file containing the code is copied to the Hub, and the pipeline's model and tokenizer are also saved and pushed to the Hub. Your pipeline should now be available on the Hub under your namespace. | ||||
| Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been | ||||
| fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. | ||||
|  | ||||
| ```py | ||||
| from transformers import pipeline | ||||
|  | ||||
| pipeline = pipeline(task="pair-classification", model="sgugger/finetuned-bert-mrpc") | ||||
| pipeline.push_to_hub("pair-classification-pipeline") | ||||
| classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") | ||||
| ``` | ||||
|  | ||||
| To use the pipeline, add `trust_remote_code=True` when loading the pipeline. | ||||
| Then we can share it on the Hub by using the `push_to_hub` method: | ||||
|  | ||||
| ```py | ||||
| classifier.push_to_hub("test-dynamic-pipeline") | ||||
| ``` | ||||
|  | ||||
| This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`, | ||||
| along with saving the model and tokenizer of the pipeline, before pushing everything into the repository | ||||
| `{your_username}/test-dynamic-pipeline`. After that, anyone can use it as long as they provide the option | ||||
| `trust_remote_code=True`: | ||||
|  | ||||
| ```py | ||||
| from transformers import pipeline | ||||
|  | ||||
| pipeline = pipeline(task="pair-classification", trust_remote_code=True) | ||||
| classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) | ||||
| ``` | ||||
|  | ||||
| ### Add to Transformers | ||||
| ## Add the pipeline to 🤗 Transformers | ||||
|  | ||||
| Adding a custom pipeline to Transformers requires adding tests to make sure everything works as expected, and requesting a review from the Transformers team. | ||||
| If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the `pipelines` submodule | ||||
| with the code of your pipeline, then add it to the list of tasks defined in `pipelines/__init__.py`. | ||||
|  | ||||
| Add your pipeline code as a new module to the [pipelines](https://github.com/huggingface/transformers/tree/main/src/transformers/pipelines) submodule, and add it to the list of tasks defined in [pipelines/__init__.py](https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/__init__.py). | ||||
| Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py`, following the examples of the other tests. | ||||
|  | ||||
| Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline. | ||||
| The `run_pipeline_test` function will be very generic and run on small random models on every possible | ||||
| architecture as defined by `model_mapping` and `tf_model_mapping`. | ||||
|  | ||||
| The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48) and [tf_model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L49). This is important for testing future compatibility with new models. | ||||
| This is very important to test future compatibility, meaning if someone adds a new model for | ||||
| `XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's | ||||
| impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the | ||||
| output of the pipeline TYPE. | ||||
|  | ||||
| You'll also notice `ANY` is used throughout the [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function. The models are random, so you can't check the actual values. Using `ANY` allows the test to match the output of the pipeline type instead. | ||||
| You also *need* to implement 2 (ideally 4) tests. | ||||
|  | ||||
| Finally, you should also implement the following 4 tests. | ||||
|  | ||||
| 1. [test_small_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L59) and [test_small_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L150), use a small model for these pipelines to make sure they return the correct outputs. The results don't have to make sense. Each pipeline should return the same result. | ||||
| 1. [test_large_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L187) and [test_large_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L220), use a realistic model for these pipelines to make sure they return meaningful results. These tests are slow and should be marked as slow. | ||||
| - `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) | ||||
|   and test the pipeline outputs. The results should be the same as `test_small_model_tf`. | ||||
| - `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) | ||||
|   and test the pipeline outputs. The results should be the same as `test_small_model_pt`. | ||||
| - `test_large_model_pt` (`optional`): Tests the pipeline on a real model where the results are supposed to | ||||
|   make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make | ||||
|   sure there is no drift in future releases. | ||||
| - `test_large_model_tf` (`optional`): Tests the pipeline on a real model where the results are supposed to | ||||
|   make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make | ||||
|   sure there is no drift in future releases. | ||||
|  | ||||
| @ -13,6 +13,419 @@ specific language governing permissions and limitations under the License. | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
| # Agents and tools | ||||
|  | ||||
| [[open-in-colab]] | ||||
|  | ||||
| ### What is an agent? | ||||
|  | ||||
| Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to. | ||||
|  | ||||
| One approach to overcome this weakness is to create an *agent*. | ||||
|  | ||||
| An agent is a system that uses an LLM as its engine, and it has access to functions called *tools*. | ||||
|  | ||||
| These *tools* are functions for performing a task, and they contain all the description necessary for the agent to use them properly. | ||||
|  | ||||
| The agent can be programmed to: | ||||
| - devise a series of actions/tools and run them all at once,  like the [`CodeAgent`] | ||||
| - plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one, like the [`ReactJsonAgent`] | ||||
|  | ||||
| ### Types of agents | ||||
|  | ||||
| #### Code agent | ||||
|  | ||||
| This agent has a planning step, then generates python code to execute all its actions at once. It natively handles different input and output types for its tools, thus it is the recommended choice for multimodal tasks. | ||||
|  | ||||
| #### React agents | ||||
|  | ||||
| This is the go-to agent to solve reasoning tasks, since the ReAct framework ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) makes it really efficient to think on the basis of its previous observations. | ||||
|  | ||||
| We implement two versions of ReactJsonAgent:  | ||||
| - [`ReactJsonAgent`] generates tool calls as a JSON in its output. | ||||
| - [`ReactCodeAgent`] is a new type of ReactJsonAgent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance. | ||||
|  | ||||
| > [!TIP] | ||||
| > Read [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more about ReAct agents. | ||||
|  | ||||
| <div class="flex justify-center"> | ||||
|     <img | ||||
|         class="block dark:hidden" | ||||
|         src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Agent_ManimCE.gif" | ||||
|     /> | ||||
|     <img | ||||
|         class="hidden dark:block" | ||||
|         src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Agent_ManimCE.gif" | ||||
|     /> | ||||
| </div> | ||||
|  | ||||
|  | ||||
|  | ||||
| For example, here is how a ReAct Code agent would work its way through the following question. | ||||
|  | ||||
| ```py3 | ||||
| >>> agent.run( | ||||
| ...     "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?", | ||||
| ... ) | ||||
| =====New task===== | ||||
| How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need? | ||||
| ====Agent is executing the code below: | ||||
| bert_blocks = search(query="number of blocks in BERT base encoder") | ||||
| print("BERT blocks:", bert_blocks) | ||||
| ==== | ||||
| Print outputs: | ||||
| BERT blocks: twelve encoder blocks | ||||
|  | ||||
| ====Agent is executing the code below: | ||||
| attention_layer = search(query="number of layers in Attention is All You Need") | ||||
| print("Attention layers:", attention_layer) | ||||
| ==== | ||||
| Print outputs: | ||||
| Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture. | ||||
|  | ||||
| ====Agent is executing the code below: | ||||
| bert_blocks = 12 | ||||
| attention_layers = 6 | ||||
| diff = bert_blocks - attention_layers | ||||
| print("Difference in blocks:", diff) | ||||
| final_answer(diff) | ||||
| ==== | ||||
|  | ||||
| Print outputs: | ||||
| Difference in blocks: 6 | ||||
|  | ||||
| Final answer: 6 | ||||
| ``` | ||||
|  | ||||
| ### How can I build an agent? | ||||
|  | ||||
| To initialize an agent, you need these arguments: | ||||
|  | ||||
| - an LLM to power your agent - the agent is not exactly the LLM, it’s more like the agent is a program that uses an LLM as its engine. | ||||
| - a system prompt: what the LLM engine will be prompted with to generate its output | ||||
| - a toolbox from which the agent picks tools to execute | ||||
| - a parser to extract from the LLM output which tools to call and with which arguments | ||||
|  | ||||
| Upon initialization of the agent system, the tool attributes are used to generate a tool description, then baked into the agent’s `system_prompt` to let it know which tools it can use and why. | ||||
|  | ||||
| To start with, please install the `agents` extras in order to install all default dependencies. | ||||
|  | ||||
| ```bash | ||||
| pip install transformers[agents] | ||||
| ``` | ||||
|  | ||||
| Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating) and returns text. This callable also needs to accept a `stop_sequences` argument that indicates when to stop generating. | ||||
|  | ||||
| ```python | ||||
| from huggingface_hub import login, InferenceClient | ||||
|  | ||||
| login("<YOUR_HUGGINGFACEHUB_API_TOKEN>") | ||||
|  | ||||
| client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct") | ||||
|  | ||||
| def llm_engine(messages, stop_sequences=["Task"]) -> str: | ||||
|     response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000) | ||||
|     answer = response.choices[0].message.content | ||||
|     return answer | ||||
| ``` | ||||
|  | ||||
| You could use any `llm_engine` method as long as: | ||||
| 1. it follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`. | ||||
| 2. it stops generating outputs at the sequences passed in the argument `stop_sequences` | ||||
|  | ||||
| Additionally, `llm_engine` can also take a `grammar` argument. In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to llm_engine, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs. | ||||
|  | ||||
| You will also need a `tools` argument which accepts a list of `Tools` - it can be an empty list. You can also add the default toolbox on top of your `tools` list by defining the optional argument `add_base_tools=True`. | ||||
|  | ||||
| Now you can create an agent, like [`CodeAgent`], and run it. You can also create a [`TransformersEngine`] with a pre-initialized pipeline to run inference on your local machine using `transformers`. | ||||
| For convenience, since agentic behaviours generally require stronger models such as `Llama-3.1-70B-Instruct` that are harder to run locally for now, we also provide the [`HfApiEngine`] class that initializes a `huggingface_hub.InferenceClient` under the hood.  | ||||
|  | ||||
| ```python | ||||
| from transformers import CodeAgent, HfApiEngine | ||||
|  | ||||
| llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct") | ||||
| agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) | ||||
|  | ||||
| agent.run( | ||||
|     "Could you translate this sentence from French, say it out loud and return the audio.", | ||||
|     sentence="Où est la boulangerie la plus proche?", | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| This will be handy in case of emergency baguette need! | ||||
| You can even leave the argument `llm_engine` undefined, and an [`HfApiEngine`] will be created by default. | ||||
|  | ||||
| ```python | ||||
| from transformers import CodeAgent | ||||
|  | ||||
| agent = CodeAgent(tools=[], add_base_tools=True) | ||||
|  | ||||
| agent.run( | ||||
|     "Could you translate this sentence from French, say it out loud and give me the audio.", | ||||
|     sentence="Où est la boulangerie la plus proche?", | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| Note that we used an additional `sentence` argument: you can pass text as additional arguments to the model. | ||||
|  | ||||
| You can also use this to indicate the path to local or remote files for the model to use: | ||||
|  | ||||
| ```py | ||||
| from transformers import ReactCodeAgent | ||||
|  | ||||
| agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) | ||||
|  | ||||
| agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3") | ||||
| ``` | ||||
|  | ||||
|  | ||||
| The prompt and output parser were automatically defined, but you can easily inspect them by accessing the `system_prompt_template` attribute on your agent. | ||||
|  | ||||
| ```python | ||||
| print(agent.system_prompt_template) | ||||
| ``` | ||||
|  | ||||
| It's important to explain as clearly as possible the task you want to perform. | ||||
| Every [`~Agent.run`] operation is independent, and since an agent is powered by an LLM, minor variations in your prompt might yield completely different results. | ||||
| You can also run an agent consecutively for different tasks: each time the attributes `agent.task` and `agent.logs` will be re-initialized. | ||||
|  | ||||
|  | ||||
| #### Code execution | ||||
|  | ||||
| A Python interpreter executes the code on a set of inputs passed along with your tools. | ||||
| This should be safe because the only functions that can be called are the tools you provided (especially if it's only tools by Hugging Face) and the print function, so you're already limited in what can be executed. | ||||
|  | ||||
| The Python interpreter also doesn't allow imports by default outside of a safe list, so all the most obvious attacks shouldn't be an issue. | ||||
| You can still authorize additional imports by passing the authorized modules as a list of strings in argument `additional_authorized_imports` upon initialization of your [`ReactCodeAgent`] or [`CodeAgent`]: | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import ReactCodeAgent | ||||
|  | ||||
| >>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4']) | ||||
| >>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?") | ||||
|  | ||||
| (...) | ||||
| 'Hugging Face – Blog' | ||||
| ``` | ||||
|  | ||||
| The execution will stop at any code trying to perform an illegal operation or if there is a regular Python error with the code generated by the agent. | ||||
|  | ||||
| > [!WARNING] | ||||
| > Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52. | ||||
| > The LLM can generate arbitrary code that will then be executed: do not add any unsafe imports! | ||||
|  | ||||
| ### The system prompt | ||||
|  | ||||
| An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the [`ReactCodeAgent`] (below version is slightly simplified). | ||||
|  | ||||
| ```text | ||||
| You will be given a task to solve as best you can. | ||||
| You have access to the following tools: | ||||
| <<tool_descriptions>> | ||||
|  | ||||
| To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. | ||||
|  | ||||
| At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use. | ||||
| Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence. | ||||
| During each intermediate step, you can use 'print()' to save whatever important information you will then need. | ||||
| These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step. | ||||
|  | ||||
| In the end you have to return a final answer using the `final_answer` tool. | ||||
|  | ||||
| Here are a few examples using notional tools: | ||||
| --- | ||||
| {examples} | ||||
|  | ||||
| Above example were using notional tools that might not exist for you. You only have acces to those tools: | ||||
| <<tool_names>> | ||||
| You also can perform computations in the python code you generate. | ||||
|  | ||||
| Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward. | ||||
|  | ||||
| Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks. | ||||
| Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result. | ||||
|  | ||||
| Remember to make sure that variables you use are all defined. | ||||
|  | ||||
| Now Begin! | ||||
| ``` | ||||
|  | ||||
| The system prompt includes: | ||||
| - An *introduction* that explains how the agent should behave and what tools are. | ||||
| - A description of all the tools that is defined by a `<<tool_descriptions>>` token that is dynamically replaced at runtime with the tools defined/chosen by the user. | ||||
|     - The tool description comes from the tool attributes, `name`, `description`, `inputs` and `output_type`,  and a simple `jinja2` template that you can refine. | ||||
| - The expected output format. | ||||
|  | ||||
| You could improve the system prompt, for example, by adding an explanation of the output format. | ||||
|  | ||||
| For maximum flexibility, you can overwrite the whole system prompt template by passing your custom prompt as an argument to the `system_prompt` parameter. | ||||
|  | ||||
| ```python | ||||
| from transformers import ReactJsonAgent | ||||
| from transformers.agents import PythonInterpreterTool | ||||
|  | ||||
| agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}") | ||||
| ``` | ||||
|  | ||||
| > [!WARNING] | ||||
| > Please make sure to define the `<<tool_descriptions>>` string somewhere in your custom system prompt template so the agent is aware | ||||
| > of the available tools. | ||||
|  | ||||
|  | ||||
| ### Inspecting an agent run | ||||
|  | ||||
| Here are a few useful attributes to inspect what happened after a run: | ||||
| - `agent.logs` stores the fine-grained logs of the agent. At every step of the agent's run, everything gets stored in a dictionary that then is appended to `agent.logs`. | ||||
| - Running `agent.write_inner_memory_from_logs()` creates an inner memory of the agent's logs for the LLM to view, as a list of chat messages. This method goes over each step of the log and only stores what it's interested in as a message: for instance, it will save the system prompt and task in separate messages, then for each step it will store the LLM output as a message, and the tool call output as another message. Use this if you want a higher-level view of what has happened - but not every log will be transcribed by this method. | ||||
|  | ||||
| ## Tools | ||||
|  | ||||
| A tool is an atomic function to be used by an agent. | ||||
|  | ||||
| You can for instance check the [`PythonInterpreterTool`]: it has a name, a description, input descriptions, an output type, and a `__call__` method to perform the action. | ||||
|  | ||||
| When the agent is initialized, the tool attributes are used to generate a tool description which is baked into the agent's system prompt. This lets the agent know which tools it can use and why. | ||||
|  | ||||
| ### Default toolbox | ||||
|  | ||||
| Transformers comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools = True`: | ||||
|  | ||||
| - **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut)) | ||||
| - **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt)) | ||||
| - **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper)) | ||||
| - **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5)) | ||||
| - **Translation**: translates a given sentence from source language to target language. | ||||
| - **DuckDuckGo search**: performs a web search using DuckDuckGo. | ||||
| - **Python code interpreter**: runs your LLM-generated Python code in a secure environment. This tool will only be added to [`ReactJsonAgent`] if you initialize it with `add_base_tools=True`, since code-based agents can already natively execute Python code. | ||||
|  | ||||
|  | ||||
| You can manually use a tool by loading it with the [`load_tool`] function and then calling it with the inputs for the task to perform. | ||||
|  | ||||
|  | ||||
| ```python | ||||
| from transformers import load_tool | ||||
|  | ||||
| tool = load_tool("text-to-speech") | ||||
| audio = tool("This is a text to speech tool") | ||||
| ``` | ||||
|  | ||||
|  | ||||
| ### Create a new tool | ||||
|  | ||||
| You can create your own tool for use cases not covered by the default tools from Hugging Face. | ||||
| For example, let's create a tool that returns the most downloaded model for a given task from the Hub. | ||||
|  | ||||
| You'll start with the code below. | ||||
|  | ||||
| ```python | ||||
| from huggingface_hub import list_models | ||||
|  | ||||
| task = "text-classification" | ||||
|  | ||||
| model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) | ||||
| print(model.id) | ||||
| ``` | ||||
|  | ||||
| This code can quickly be converted into a tool, just by wrapping it in a function and adding the `tool` decorator: | ||||
|  | ||||
|  | ||||
| ```py | ||||
| from transformers import tool | ||||
|  | ||||
| @tool | ||||
| def model_download_tool(task: str) -> str: | ||||
|     """ | ||||
|     This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. | ||||
|     It returns the name of the checkpoint. | ||||
|  | ||||
|     Args: | ||||
|         task: The task for which to get the most downloaded model. | ||||
|     """ | ||||
|     model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) | ||||
|     return model.id | ||||
| ``` | ||||
|  | ||||
| The function needs: | ||||
| - A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_tool`. | ||||
| - Type hints on both inputs and output | ||||
| - A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint). | ||||
| All these will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible! | ||||
|  | ||||
| > [!TIP] | ||||
| > This definition format is the same as tool schemas used in `apply_chat_template`, the only difference is the added `tool` decorator: read more on our tool use API [here](https://huggingface.co/blog/unified-tool-use#passing-tools-to-a-chat-template). | ||||
|  | ||||
| Then you can directly initialize your agent: | ||||
| ```py | ||||
| from transformers import CodeAgent | ||||
| agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine) | ||||
| agent.run( | ||||
|     "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| You get the following: | ||||
| ```text | ||||
| ======== New task ======== | ||||
| Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub? | ||||
| ==== Agent is executing the code below: | ||||
| most_downloaded_model = model_download_tool(task="text-to-video") | ||||
| print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.") | ||||
| ==== | ||||
| ``` | ||||
|  | ||||
| And the output: | ||||
| `"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."` | ||||
|  | ||||
| ### Manage your agent's toolbox | ||||
|  | ||||
| If you have already initialized an agent, it is inconvenient to reinitialize it from scratch with a tool you want to use. With Transformers, you can manage an agent's toolbox by adding or replacing a tool. | ||||
|  | ||||
| Let's add the `model_download_tool` to an existing agent initialized with only the default toolbox. | ||||
|  | ||||
| ```python | ||||
| from transformers import CodeAgent | ||||
|  | ||||
| agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) | ||||
| agent.toolbox.add_tool(model_download_tool) | ||||
| ``` | ||||
| Now we can leverage both the new tool and the previous text-to-speech tool: | ||||
|  | ||||
| ```python | ||||
| agent.run( | ||||
|     "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?" | ||||
| ) | ||||
| ``` | ||||
|  | ||||
|  | ||||
| | **Audio**                                                                                                                                            | | ||||
| |------------------------------------------------------------------------------------------------------------------------------------------------------| | ||||
| | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/></audio> | | ||||
|  | ||||
|  | ||||
| > [!WARNING] | ||||
| > Beware when adding tools to an agent that already works well: a new tool can bias selection towards itself, or lead the agent to pick a different tool than the one it previously used for a task. | ||||
|  | ||||
|  | ||||
| Use the `agent.toolbox.update_tool()` method to replace an existing tool in the agent's toolbox. | ||||
| This is useful if your new tool is a one-to-one replacement of the existing tool because the agent already knows how to perform that specific task. | ||||
| Just make sure the new tool follows the same API as the replaced tool or adapt the system prompt template to ensure all examples using the replaced tool are updated. | ||||
|  | ||||
|  | ||||
| ### Use a collection of tools | ||||
|  | ||||
| You can leverage tool collections by using the ToolCollection object, with the slug of the collection you want to use. | ||||
| Then pass them as a list to initialize your agent, and start using them! | ||||
|  | ||||
| ```py | ||||
| from transformers import ToolCollection, ReactCodeAgent | ||||
|  | ||||
| image_tool_collection = ToolCollection(collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f") | ||||
| agent = ReactCodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True) | ||||
|  | ||||
| agent.run("Please draw me a picture of rivers and lakes.") | ||||
| ``` | ||||
|  | ||||
| To speed up the start, tools are loaded only if called by the agent. | ||||
|  | ||||
| This gets you this image: | ||||
|  | ||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png"> | ||||
|  | ||||
							
								
								
									
										261
									
								
								docs/source/en/agents_advanced.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										261
									
								
								docs/source/en/agents_advanced.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,261 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
| # Agents, supercharged - Multi-agents, External tools, and more | ||||
|  | ||||
| [[open-in-colab]] | ||||
|  | ||||
| ### What is an agent? | ||||
|  | ||||
| > [!TIP] | ||||
| > If you're new to `transformers.agents`, make sure to first read the main [agents documentation](./agents). | ||||
|  | ||||
| In this page we're going to highlight several advanced uses of `transformers.agents`. | ||||
|  | ||||
| ## Multi-agents | ||||
|  | ||||
| Multi-agent has been introduced in Microsoft's framework [Autogen](https://huggingface.co/papers/2308.08155). | ||||
| It simply means having several agents working together to solve your task instead of only one. | ||||
| It empirically yields better performance on most benchmarks. The reason for this better performance is conceptually simple: for many tasks, rather than using a do-it-all system, you would prefer to specialize units on sub-tasks. Here, having agents with separate tool sets and memories allows you to achieve efficient specialization. | ||||
|  | ||||
| You can easily build hierarchical multi-agent systems with `transformers.agents`. | ||||
|  | ||||
| To do so, encapsulate the agent in a [`ManagedAgent`] object. This object needs arguments `agent`, `name`, and a `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools. | ||||
|  | ||||
| Here's an example of making an agent that manages a specific web search agent using our [`DuckDuckGoSearchTool`]: | ||||
|  | ||||
| ```py | ||||
| from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent | ||||
|  | ||||
| llm_engine = HfApiEngine() | ||||
|  | ||||
| web_agent = ReactCodeAgent(tools=[DuckDuckGoSearchTool()], llm_engine=llm_engine) | ||||
|  | ||||
| managed_web_agent = ManagedAgent( | ||||
|     agent=web_agent, | ||||
|     name="web_search", | ||||
|     description="Runs web searches for you. Give it your query as an argument." | ||||
| ) | ||||
|  | ||||
| manager_agent = ReactCodeAgent( | ||||
|     tools=[], llm_engine=llm_engine, managed_agents=[managed_web_agent] | ||||
| ) | ||||
|  | ||||
| manager_agent.run("Who is the CEO of Hugging Face?") | ||||
| ``` | ||||
|  | ||||
| > [!TIP] | ||||
| > For an in-depth example of an efficient multi-agent implementation, see [how we pushed our multi-agent system to the top of the GAIA leaderboard](https://huggingface.co/blog/beating-gaia). | ||||
|  | ||||
|  | ||||
| ## Advanced tool usage | ||||
|  | ||||
| ### Directly define a tool by subclassing Tool, and share it to the Hub | ||||
|  | ||||
| Let's take again the tool example from the main documentation, for which we had implemented a `tool` decorator. | ||||
|  | ||||
| If you need to add variation, like custom attributes for your tool, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass. | ||||
|  | ||||
| The custom tool needs: | ||||
| - An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_counter`. | ||||
| - An attribute `description` is used to populate the agent's system prompt. | ||||
| - An `inputs` attribute, which is a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input. | ||||
| - An `output_type` attribute, which specifies the output type. | ||||
| - A `forward` method which contains the inference code to be executed. | ||||
|  | ||||
| The types for both `inputs` and `output_type` should be amongst [Pydantic formats](https://docs.pydantic.dev/latest/concepts/json_schema/#generating-json-schema). | ||||
|  | ||||
| ```python | ||||
| from transformers import Tool | ||||
| from huggingface_hub import list_models | ||||
|  | ||||
| class HFModelDownloadsTool(Tool): | ||||
|     name = "model_download_counter" | ||||
|     description = """ | ||||
|     This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. | ||||
|     It returns the name of the checkpoint.""" | ||||
|  | ||||
|     inputs = { | ||||
|         "task": { | ||||
|             "type": "string", | ||||
|             "description": "the task category (such as text-classification, depth-estimation, etc)", | ||||
|         } | ||||
|     } | ||||
|     output_type = "string" | ||||
|  | ||||
|     def forward(self, task: str): | ||||
|         model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) | ||||
|         return model.id | ||||
| ``` | ||||
|  | ||||
| Now that the custom `HFModelDownloadsTool` class is ready, you can save it to a file named `model_downloads.py` and import it for use. | ||||
|  | ||||
|  | ||||
| ```python | ||||
| from model_downloads import HFModelDownloadsTool | ||||
|  | ||||
| tool = HFModelDownloadsTool() | ||||
| ``` | ||||
|  | ||||
| You can also share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with write access. | ||||
|  | ||||
| ```python | ||||
| tool.push_to_hub("{your_username}/hf-model-downloads") | ||||
| ``` | ||||
|  | ||||
| Load the tool with the [`~Tool.load_tool`] function and pass it to the `tools` parameter in your agent. | ||||
|  | ||||
| ```python | ||||
| from transformers import load_tool, CodeAgent | ||||
|  | ||||
| model_download_tool = load_tool("m-ric/hf-model-downloads") | ||||
| ``` | ||||
|  | ||||
| ### Import a Space as a tool 🚀 | ||||
|  | ||||
| You can directly import a Space from the Hub as a tool using the [`Tool.from_space`] method! | ||||
|  | ||||
| You only need to provide the id of the Space on the Hub, its name, and a description that will help your agent understand what the tool does. Under the hood, this will use the [`gradio-client`](https://pypi.org/project/gradio-client/) library to call the Space. | ||||
|  | ||||
| For instance, let's import the [FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) Space from the Hub and use it to generate an image. | ||||
|  | ||||
| ```python | ||||
| from transformers import Tool | ||||
|  | ||||
| image_generation_tool = Tool.from_space( | ||||
|     "black-forest-labs/FLUX.1-dev", | ||||
|     name="image_generator", | ||||
|     description="Generate an image from a prompt") | ||||
|  | ||||
| image_generation_tool("A sunny beach") | ||||
| ``` | ||||
| And voilà, here's your image! 🏖️ | ||||
|  | ||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sunny_beach.webp"> | ||||
|  | ||||
| Then you can use this tool just like any other tool. For example, let's improve the prompt `a rabbit wearing a space suit` and generate an image of it. | ||||
|  | ||||
| ```python | ||||
| from transformers import ReactCodeAgent | ||||
|  | ||||
| agent = ReactCodeAgent(tools=[image_generation_tool]) | ||||
|  | ||||
| agent.run( | ||||
|     "Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit' | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| ```text | ||||
| === Agent thoughts: | ||||
| improved_prompt could be "A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background" | ||||
|  | ||||
| Now that I have improved the prompt, I can use the image generator tool to generate an image based on this prompt. | ||||
| === Agent is executing the code below: | ||||
| image = image_generator(prompt="A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background") | ||||
| final_answer(image) | ||||
| ``` | ||||
|  | ||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit_spacesuit_flux.webp"> | ||||
|  | ||||
| How cool is this? 🤩 | ||||
|  | ||||
| ### Use gradio-tools | ||||
|  | ||||
| [gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging | ||||
| Face Spaces as tools. It supports many existing Spaces as well as custom Spaces. | ||||
|  | ||||
| Transformers supports `gradio_tools` with the [`Tool.from_gradio`] method. For example, let's use the [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) from the `gradio-tools` toolkit for improving prompts to generate better images. | ||||
|  | ||||
| Import and instantiate the tool, then pass it to the `Tool.from_gradio` method: | ||||
|  | ||||
| ```python | ||||
| from gradio_tools import StableDiffusionPromptGeneratorTool | ||||
| from transformers import Tool, load_tool, CodeAgent | ||||
|  | ||||
| gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool() | ||||
| prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool) | ||||
| ``` | ||||
|  | ||||
| > [!WARNING] | ||||
| > gradio-tools requires *textual* inputs and outputs even when working with different modalities like image and audio objects. Image and audio inputs and outputs are currently incompatible. | ||||
|  | ||||
| ### Use LangChain tools | ||||
|  | ||||
| We love LangChain and think it has a very compelling suite of tools. | ||||
| To import a tool from LangChain, use the `from_langchain()` method. | ||||
|  | ||||
| Here is how you can use it to recreate the intro's search result using a LangChain web search tool. | ||||
| This tool will need `pip install google-search-results` to work properly. | ||||
| ```python | ||||
| from langchain.agents import load_tools | ||||
| from transformers import Tool, ReactCodeAgent | ||||
|  | ||||
| search_tool = Tool.from_langchain(load_tools(["serpapi"])[0]) | ||||
|  | ||||
| agent = ReactCodeAgent(tools=[search_tool]) | ||||
|  | ||||
| agent.run("How many more blocks (also denoted as layers) are in BERT base encoder compared to the encoder from the architecture proposed in Attention is All You Need?") | ||||
| ``` | ||||
|  | ||||
| ## Display your agent run in a cool Gradio interface | ||||
|  | ||||
| You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`. Here is an example: | ||||
|  | ||||
| ```py | ||||
| import gradio as gr | ||||
| from transformers import ( | ||||
|     load_tool, | ||||
|     ReactCodeAgent, | ||||
|     HfApiEngine, | ||||
|     stream_to_gradio, | ||||
| ) | ||||
|  | ||||
| # Import tool from Hub | ||||
| image_generation_tool = load_tool("m-ric/text-to-image") | ||||
|  | ||||
| llm_engine = HfApiEngine("meta-llama/Meta-Llama-3-70B-Instruct") | ||||
|  | ||||
| # Initialize the agent with the image generation tool | ||||
| agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine) | ||||
|  | ||||
|  | ||||
| def interact_with_agent(task): | ||||
|     messages = [] | ||||
|     messages.append(gr.ChatMessage(role="user", content=task)) | ||||
|     yield messages | ||||
|     for msg in stream_to_gradio(agent, task): | ||||
|         messages.append(msg) | ||||
|         yield messages + [ | ||||
|             gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!") | ||||
|         ] | ||||
|     yield messages | ||||
|  | ||||
|  | ||||
| with gr.Blocks() as demo: | ||||
|     text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.") | ||||
|     submit = gr.Button("Run illustrator agent!") | ||||
|     chatbot = gr.Chatbot( | ||||
|         label="Agent", | ||||
|         type="messages", | ||||
|         avatar_images=( | ||||
|             None, | ||||
|             "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png", | ||||
|         ), | ||||
|     ) | ||||
|     submit.click(interact_with_agent, [text_input], [chatbot]) | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     demo.launch() | ||||
| ``` | ||||
| @ -1,168 +0,0 @@ | ||||
| <!--Copyright 2025 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Attention Interface | ||||
|  | ||||
| This page describes how to use the `AttentionInterface` in order to register custom attention functions to use with | ||||
| supported models. | ||||
|  | ||||
| ## Customizing attention function | ||||
|  | ||||
| Most recent models can now switch from one attention function used in the Attention layer to another, thanks to a simple mapping. | ||||
| By default, we provide the implementation for [`sdpa`](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html), | ||||
| [`flash_attention_2`](https://github.com/Dao-AILab/flash-attention) and [`flex_attention`](https://pytorch.org/docs/stable/nn.attention.flex_attention.html#module-torch.nn.attention.flex_attention) | ||||
| as well as `eager`, which is a simple matrix multiplication without any optimization on top.   | ||||
| This is the setting you can usually choose when instantiating a model: | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoModelForCausalLM | ||||
|  | ||||
| model_id = "meta-llama/Llama-3.2-1B" | ||||
|  | ||||
| # Here, using flash attention as an example | ||||
| model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="flash_attention_2") | ||||
| ``` | ||||
|  | ||||
| But what if you wanted to create your own attention function? Or simply play around with existing ones, adding | ||||
| a few statements here and there? You can now do so with the `AttentionInterface`! Here is an example: | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoModelForCausalLM, AttentionInterface | ||||
| from transformers.integrations.sdpa_attention import sdpa_attention_forward | ||||
| import torch | ||||
|  | ||||
| model_id = "meta-llama/Llama-3.2-1B" | ||||
|  | ||||
| def my_new_sdpa(*args, **kwargs): | ||||
|     print("I just entered the attention computation") | ||||
|     return sdpa_attention_forward(*args, **kwargs) | ||||
|  | ||||
| AttentionInterface.register("my_new_sdpa", my_new_sdpa) | ||||
|  | ||||
| model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="my_new_sdpa") | ||||
| # Try running the forward with the new attention function | ||||
| model(torch.ones(1, 5, dtype=int)) | ||||
| ``` | ||||
|  | ||||
| You will see it prints "I just entered the attention computation" as many times as there are layers in the model (with this example, 16 times). | ||||
|  | ||||
| ## Dynamically switching attention function | ||||
|  | ||||
| You could dynamically change the model's attention function as well, by overriding the `config._attn_implementation` field: | ||||
|  | ||||
| ```python | ||||
| # Back to use original sdpa implementation | ||||
| model.config._attn_implementation = "sdpa" | ||||
|  | ||||
| model(torch.ones(1, 5, dtype=int)) | ||||
| ``` | ||||
|  | ||||
| and it will stop printing the statements, as it now uses the `sdpa` attention.   | ||||
| This allows you to quickly change an attention function, without needing to reload the model! | ||||
|  | ||||
| ## What about new args needed in my custom attention function? | ||||
|  | ||||
| But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the | ||||
| `AttentionInterface` propagate kwargs all the way to the Attention layers, and to the used attention function. That way, | ||||
| you can simply pass the arg (as a keyword argument, i.e. you need to qualify the name of the arg) in the model's forward, and it will be correctly used in the attention. However, custom attention functions have some limitations. In particular, they must follow the signature and return format of other attention functions, i.e. | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoModelForCausalLM, AttentionInterface | ||||
| from transformers.integrations.sdpa_attention import sdpa_attention_forward | ||||
| import torch | ||||
|  | ||||
| def custom_attention( | ||||
|     module: torch.nn.Module,  # required arg | ||||
|     query: torch.Tensor,  # required arg | ||||
|     key: torch.Tensor,  # required arg | ||||
|     value: torch.Tensor,  # required arg | ||||
|     attention_mask: Optional[torch.Tensor],  # required arg | ||||
|     a_new_kwargs = None,  # You can now add as many kwargs as you need | ||||
|     another_new_kwargs = None,  # You can now add as many kwargs as you need | ||||
|     **kwargs,  # You need to accept **kwargs as models will pass other args | ||||
| ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: | ||||
|     ...  # do your magic! | ||||
|     return attn_output, attn_weights  # attn_weights are optional here | ||||
|  | ||||
| AttentionInterface.register("custom", custom_attention) | ||||
|  | ||||
| model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="custom") | ||||
| # Forward pass with the new kwargs | ||||
| model(torch.ones(1, 5, dtype=int), a_new_kwargs=..., another_new_kwargs=...) | ||||
| ``` | ||||
|  | ||||
| If in doubt about what args/kwargs a given model sends to the attention function, simply check that model's modeling code on [GitHub](https://github.com/huggingface/transformers/tree/main/src/transformers/models)! | ||||
|  | ||||
| ## Accessing currently available implementations | ||||
|  | ||||
| Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one, | ||||
| and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you | ||||
| would expect from a usual Python dictionary: | ||||
|  | ||||
| ```python | ||||
| >>> from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS | ||||
|  | ||||
| >>> list(ALL_ATTENTION_FUNCTIONS.keys()) | ||||
| >>> ['flash_attention_2', 'flex_attention', 'sdpa'] | ||||
|  | ||||
| >>> ALL_ATTENTION_FUNCTIONS["sdpa"] | ||||
| >>> <function transformers.integrations.sdpa_attention.sdpa_attention_forward> | ||||
|  | ||||
| >>> ALL_ATTENTION_FUNCTIONS.get("sdpa", None) | ||||
| >>> <function transformers.integrations.sdpa_attention.sdpa_attention_forward> | ||||
|  | ||||
| # You can also globally `register` a new function directly on it | ||||
| >>> ALL_ATTENTION_FUNCTIONS.register("new_func", new_func) | ||||
| ``` | ||||
|  | ||||
| ## Attention Mask Interface | ||||
|  | ||||
| Having a new attention function may mean that you need a new format of attention mask to decide what key and value tokens | ||||
| the query tokens should attend to. This is now possible with the `AttentionMaskInterface`! It works in the same way as | ||||
| the `AttentionInterface`: | ||||
|  | ||||
| ```python | ||||
| from transformers import AttentionMaskInterface | ||||
| from transformers.masking_utils import sdpa_mask | ||||
| import torch | ||||
|  | ||||
| def my_new_sdpa_mask(*args, **kwargs): | ||||
|     print("I just entered the attention mask computation") | ||||
|     return sdpa_mask(*args, **kwargs) | ||||
|  | ||||
| AttentionMaskInterface.register("my_new_sdpa_mask", my_new_sdpa_mask) | ||||
| ``` | ||||
|  | ||||
| The reason you have to register it is because we need to automatically correct your mask format based on the attention implementation (for example, flex attention uses a BlockMask format, while sdpa uses a 4D tensor). | ||||
| By default, if you do not register an attention mask function along with your attention function, mask creation will be skipped | ||||
| and `attention_mask=None` will be passed along to the Attention layers. | ||||
|  | ||||
| The default signature of the attention mask functions is the following: | ||||
|  | ||||
| ```python | ||||
| def custom_attention_mask( | ||||
|     batch_size: int,  # required arg | ||||
|     cache_position: torch.Tensor,  # required arg | ||||
|     kv_length: int,  # required arg | ||||
|     kv_offset: int = 0,  # required arg | ||||
|     mask_function: Callable = causal_mask_function,  # required arg | ||||
|     attention_mask: Optional[torch.Tensor] = None,  # required arg | ||||
|     **kwargs,  # a few additional args may be passed as kwargs, especially the model's config is always passed | ||||
| ) -> Optional[torch.Tensor]: | ||||
| ``` | ||||
|  | ||||
| It mostly works thanks to the `mask_function`, which is a `Callable` in the form of [torch's mask_mod functions](https://pytorch.org/blog/flexattention/), taking 4 indices as input and returning a boolean to indicate if this position should take part in the attention computation. | ||||
|  | ||||
| If you cannot use the `mask_function` to create your mask for some reason, you can try to work around it by doing something similar to our [torch export workaround](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/executorch.py). | ||||
| @ -1,279 +0,0 @@ | ||||
| <!--Copyright 2025 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Utilizing the @auto_docstring Decorator | ||||
|  | ||||
| The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions. | ||||
|  | ||||
| --- | ||||
|  | ||||
| ## 📜 How it Works | ||||
|  | ||||
| The `@auto_docstring` decorator constructs docstrings by: | ||||
|  | ||||
| 1.  **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function. | ||||
| 2.  **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`). | ||||
| 3.  **Overriding or Adding Arguments Descriptions:** | ||||
|     * **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions. | ||||
|     * **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file. | ||||
| 4.  **Adding Classes and Functions Introduction:** | ||||
|     * **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring. | ||||
|     * **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source. | ||||
| 5.  **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`. | ||||
| 6.  **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers. | ||||
| 7.  **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extract field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block. | ||||
| 8.  **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`. | ||||
|  | ||||
|  | ||||
| --- | ||||
|  | ||||
| ## 🚀 How to Use @auto_docstring | ||||
|  | ||||
| ### 1. Importing the Decorator | ||||
| Import the decorator into your modeling file: | ||||
|  | ||||
| ```python | ||||
| from ...utils import auto_docstring | ||||
| ``` | ||||
|  | ||||
| ### 2. Applying to Classes | ||||
| Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions. | ||||
|  | ||||
| ```python | ||||
| from transformers.modeling_utils import PreTrainedModel | ||||
| from ...utils import auto_docstring | ||||
|  | ||||
| @auto_docstring | ||||
| class MyAwesomeModel(PreTrainedModel): | ||||
|     def __init__(self, config, custom_parameter: int = 10, another_custom_arg: str = "default"): | ||||
|         r""" | ||||
|         custom_parameter (`int`, *optional*, defaults to 10): | ||||
|             Description of the custom_parameter for MyAwesomeModel. | ||||
|         another_custom_arg (`str`, *optional*, defaults to "default"): | ||||
|             Documentation for another unique argument. | ||||
|         """ | ||||
|         super().__init__(config) | ||||
|         self.custom_parameter = custom_parameter | ||||
|         self.another_custom_arg = another_custom_arg | ||||
|         # ... rest of your init | ||||
|  | ||||
|     # ... other methods | ||||
| ``` | ||||
|  | ||||
| #### Advanced Class Decoration: | ||||
|  | ||||
| Arguments can be passed directly to `@auto_docstring` for more control: | ||||
|  | ||||
| ```python | ||||
| @auto_docstring( | ||||
|     custom_intro="""This model performs specific synergistic operations. | ||||
|     It builds upon the standard Transformer architecture with unique modifications.""", | ||||
|     custom_args=""" | ||||
|     custom_parameter (`type`, *optional*, defaults to `default_value`): | ||||
|         A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`. | ||||
|     internal_helper_arg (`type`, *optional*, defaults to `default_value`): | ||||
|         A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`. | ||||
|     """ | ||||
| ) | ||||
| class MySpecialModel(PreTrainedModel): | ||||
|     def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None): | ||||
|         # ... | ||||
| ``` | ||||
|  | ||||
| Or: | ||||
|  | ||||
| ```python | ||||
| @auto_docstring( | ||||
|     custom_intro="""This model performs specific synergistic operations. | ||||
|     It builds upon the standard Transformer architecture with unique modifications.""", | ||||
| ) | ||||
| class MySpecialModel(PreTrainedModel): | ||||
|     def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None): | ||||
|         r""" | ||||
|         custom_parameter (`type`, *optional*, defaults to `default_value`): | ||||
|             A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`. | ||||
|         internal_helper_arg (`type`, *optional*, defaults to `default_value`): | ||||
|             A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`. | ||||
|         """ | ||||
|         # ... | ||||
| ``` | ||||
|  | ||||
| ### 3. Applying to Functions (e.g., `forward` method) | ||||
| Apply the decorator above method definitions, such as the `forward` method. | ||||
|  | ||||
| ```python | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         input_ids: Optional[torch.Tensor] = None, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         new_custom_argument: Optional[torch.Tensor] = None, | ||||
|         arg_documented_in_args_doc: Optional[torch.Tensor] = None, | ||||
|         # ... other arguments | ||||
|     ) -> Union[Tuple, ModelOutput]: # The description of the return value will automatically be generated from the ModelOutput class docstring. | ||||
|         r""" | ||||
|         new_custom_argument (`torch.Tensor`, *optional*): | ||||
|             Description of this new custom argument and its expected shape or type. | ||||
|         """ | ||||
|         # ... | ||||
| ``` | ||||
|  | ||||
| #### Advanced Function Decoration: | ||||
|  | ||||
| Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified: | ||||
|  | ||||
| ```python | ||||
| MODEL_COMMON_CUSTOM_ARGS = r""" | ||||
|     common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`): | ||||
|         Description of common_arg_1 | ||||
|     common_arg_2 (`torch.Tensor`, *optional*, defaults to `default_value`): | ||||
|         Description of common_arg_2 | ||||
|     ... | ||||
| """ | ||||
|  | ||||
| class MyModel(PreTrainedModel): | ||||
|     # ... | ||||
|     @auto_docstring( | ||||
|         custom_intro=""" | ||||
|         This is a custom introduction for the function. | ||||
|         """, | ||||
|         custom_args=MODEL_COMMON_CUSTOM_ARGS | ||||
|     ) | ||||
|     def forward( | ||||
|         self, | ||||
|         input_ids: Optional[torch.Tensor] = None, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         common_arg_1: Optional[torch.Tensor] = None, | ||||
|         common_arg_2: Optional[torch.Tensor] = None, | ||||
|         #... | ||||
|         function_specific_argument: Optional[torch.Tensor] = None, | ||||
|         # ... other arguments | ||||
|     ) -> torch.Tensor: | ||||
|         r""" | ||||
|         function_specific_argument (`torch.Tensor`, *optional*): | ||||
|             Description of an argument specific to this function | ||||
|  | ||||
|         Returns: | ||||
|             `torch.Tensor`: For a function returning a generic type, a custom "Returns" section can be specified. | ||||
|  | ||||
|         Example: | ||||
|  | ||||
|         (To override the default example with a custom one or to add an example for a model class that does not have a pipeline) | ||||
|  | ||||
|         ```python | ||||
|         ... | ||||
|         ``` | ||||
|         """ | ||||
|         # ... | ||||
| ``` | ||||
|  | ||||
| --- | ||||
|  | ||||
| ### ✍️ Documenting Arguments: Approach & Priority | ||||
|  | ||||
| 1.  **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):** | ||||
|     * `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`. | ||||
|  | ||||
| 2.  **New or Custom Arguments:** | ||||
|     * **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters). | ||||
|     * **Format:** | ||||
|         ``` | ||||
|         argument_name (`type`, *optional*, defaults to `X`): | ||||
|             Description of the argument. | ||||
|             Explain its purpose, expected shape/type if complex, and default behavior. | ||||
|             This can span multiple lines. | ||||
|         ``` | ||||
|     * Include `type` in backticks. | ||||
|     * Add "*optional*" if the argument is not required (has a default value). | ||||
|     * Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`). | ||||
|  | ||||
| 3.  **Overriding Standard Arguments:** | ||||
|     * If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence. | ||||
|     * The `labels` argument is often customized per model and typically requires a specific docstring. | ||||
|  | ||||
| 4.  **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):** | ||||
|     * New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file. | ||||
|  | ||||
| --- | ||||
|  | ||||
| ### Usage with [modular files](./modular_transformers) | ||||
|  | ||||
| When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator: | ||||
|  | ||||
| - **For standalone models in modular files:** | ||||
|   Apply the `@auto_docstring` decorator just as you would in regular modeling files. | ||||
|  | ||||
| - **For models inheriting from other library models:** | ||||
|   - When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file. | ||||
|   - If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class. | ||||
|  | ||||
|   > **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file. | ||||
|  | ||||
|  | ||||
| **Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly. | ||||
|  | ||||
| --- | ||||
|  | ||||
| ## ✅ Checking Your Docstrings with `check_auto_docstrings` | ||||
|  | ||||
| The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI). | ||||
|  | ||||
| #### What it Checks: | ||||
|  | ||||
| * **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO) | ||||
| * **Argument Completeness & Consistency:** | ||||
|     * Flags arguments in the signature that are not known standard arguments and lack a local description. | ||||
|     * Ensures documented arguments exist in the signature. (TODO) | ||||
|     * Verifies that types and default values in the docstring match the signature. (TODO) | ||||
| * **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`. | ||||
| * **Formatting:** Adherence to the expected docstring style. | ||||
|  | ||||
| #### Running the Check Locally: | ||||
|  | ||||
| Run this check locally before committing. The common command is: | ||||
|  | ||||
| ```bash | ||||
| make fix-copies | ||||
| ``` | ||||
|  | ||||
| Alternatively, to only perform docstrings and auto-docstring checks, you can use: | ||||
|  | ||||
| ```bash | ||||
| python utils/check_docstrings.py # to only check files included in the diff without fixing them | ||||
| # Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff | ||||
| # Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files | ||||
| ``` | ||||
|  | ||||
| #### Workflow with the Checker: | ||||
|  | ||||
| 1.  Add `@auto_docstring(...)` to the class or method. | ||||
| 2.  For new, custom, or overridden arguments, add descriptions in an `r""" """` block. | ||||
| 3.  Run `make fix-copies` (or the `check_docstrings.py` utility). | ||||
|     * For unrecognized arguments lacking documentation, the utility will create placeholder entries. | ||||
| 4.  Manually edit these placeholders with accurate types and descriptions. | ||||
| 5.  Re-run the check to ensure all issues are resolved. | ||||
|  | ||||
| --- | ||||
|  | ||||
| ## 🔑 Key Takeaways & Best Practices | ||||
|  | ||||
| * Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary methods (e.g., `forward`, `get_text_features` etc.). | ||||
| * For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class. | ||||
| * Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model. | ||||
| * Document new or custom arguments clearly. | ||||
| * Run `check_docstrings` locally and iteratively. | ||||
|  | ||||
| By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗. | ||||
							
								
								
									
										189
									
								
								docs/source/en/autoclass_tutorial.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										189
									
								
								docs/source/en/autoclass_tutorial.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,189 @@ | ||||
| <!--Copyright 2022 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Load pretrained instances with an AutoClass | ||||
|  | ||||
| With so many different Transformer architectures, it can be challenging to create one for your checkpoint. As a part of 🤗 Transformers core philosophy to make the library easy, simple and flexible to use, an `AutoClass` automatically infers and loads the correct architecture from a given checkpoint. The `from_pretrained()` method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different. | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/google-bert/bert-base-uncased) is an architecture, while `google-bert/bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint. | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
| In this tutorial, learn to: | ||||
|  | ||||
| * Load a pretrained tokenizer. | ||||
| * Load a pretrained image processor. | ||||
| * Load a pretrained feature extractor. | ||||
| * Load a pretrained processor. | ||||
| * Load a pretrained model. | ||||
| * Load a model as a backbone. | ||||
|  | ||||
| ## AutoTokenizer | ||||
|  | ||||
| Nearly every NLP task begins with a tokenizer. A tokenizer converts your input into a format that can be processed by the model. | ||||
|  | ||||
| Load a tokenizer with [`AutoTokenizer.from_pretrained`]: | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import AutoTokenizer | ||||
|  | ||||
| >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") | ||||
| ``` | ||||
|  | ||||
| Then tokenize your input as shown below: | ||||
|  | ||||
| ```py | ||||
| >>> sequence = "In a hole in the ground there lived a hobbit." | ||||
| >>> print(tokenizer(sequence)) | ||||
| {'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102],  | ||||
|  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  | ||||
|  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} | ||||
| ``` | ||||
|  | ||||
| ## AutoImageProcessor | ||||
|  | ||||
| For vision tasks, an image processor processes the image into the correct input format. | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import AutoImageProcessor | ||||
|  | ||||
| >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") | ||||
| ``` | ||||
|  | ||||
| ## AutoBackbone | ||||
|  | ||||
| <div style="text-align: center"> | ||||
|     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stages.png"> | ||||
|     <figcaption class="mt-2 text-center text-sm text-gray-500">A Swin backbone with multiple stages for outputting a feature map.</figcaption> | ||||
| </div> | ||||
|  | ||||
| The [`AutoBackbone`] lets you use pretrained models as backbones to get feature maps from different stages of the backbone. You should specify one of the following parameters in [`~PretrainedConfig.from_pretrained`]: | ||||
|  | ||||
| * `out_indices` is the index of the layer you'd like to get the feature map from | ||||
| * `out_features` is the name of the layer you'd like to get the feature map from | ||||
|  | ||||
| These parameters can be used interchangeably, but if you use both, make sure they're aligned with each other! If you don't pass any of these parameters, the backbone returns the feature map from the last layer. | ||||
|  | ||||
| <div style="text-align: center"> | ||||
|     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stage%201.png"> | ||||
|     <figcaption class="mt-2 text-center text-sm text-gray-500">A feature map from the first stage of the backbone. The patch partition refers to the model stem.</figcaption> | ||||
| </div> | ||||
|  | ||||
| For example, in the above diagram, to return the feature map from the first stage of the Swin backbone, you can set `out_indices=(1,)`: | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import AutoImageProcessor, AutoBackbone | ||||
| >>> import torch | ||||
| >>> from PIL import Image | ||||
| >>> import requests | ||||
| >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" | ||||
| >>> image = Image.open(requests.get(url, stream=True).raw) | ||||
| >>> processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") | ||||
| >>> model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) | ||||
|  | ||||
| >>> inputs = processor(image, return_tensors="pt") | ||||
| >>> outputs = model(**inputs) | ||||
| >>> feature_maps = outputs.feature_maps | ||||
| ``` | ||||
|  | ||||
| Now you can access the `feature_maps` object from the first stage of the backbone: | ||||
|  | ||||
| ```py | ||||
| >>> list(feature_maps[0].shape) | ||||
| [1, 96, 56, 56] | ||||
| ``` | ||||
|  | ||||
| ## AutoFeatureExtractor | ||||
|  | ||||
| For audio tasks, a feature extractor processes the audio signal into the correct input format. | ||||
|  | ||||
| Load a feature extractor with [`AutoFeatureExtractor.from_pretrained`]: | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import AutoFeatureExtractor | ||||
|  | ||||
| >>> feature_extractor = AutoFeatureExtractor.from_pretrained( | ||||
| ...     "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" | ||||
| ... ) | ||||
| ``` | ||||
|  | ||||
| ## AutoProcessor | ||||
|  | ||||
| Multimodal tasks require a processor that combines two types of preprocessing tools. For example, the [LayoutLMV2](model_doc/layoutlmv2) model requires an image processor to handle images and a tokenizer to handle text; a processor combines both of them. | ||||
|  | ||||
| Load a processor with [`AutoProcessor.from_pretrained`]: | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import AutoProcessor | ||||
|  | ||||
| >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") | ||||
| ``` | ||||
|  | ||||
| ## AutoModel | ||||
|  | ||||
| <frameworkcontent> | ||||
| <pt> | ||||
| The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]. | ||||
|  | ||||
| > [!WARNING] | ||||
| > By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import AutoModelForSequenceClassification | ||||
|  | ||||
| >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") | ||||
| ``` | ||||
|  | ||||
| Easily reuse the same checkpoint to load an architecture for a different task: | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import AutoModelForTokenClassification | ||||
|  | ||||
| >>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") | ||||
| ``` | ||||
|  | ||||
| <Tip warning={true}> | ||||
|  | ||||
| For PyTorch models, the `from_pretrained()` method uses `torch.load()` which internally uses `pickle` and is known to be insecure. In general, never load a model that could have come from an untrusted source, or that could have been tampered with. This security risk is partially mitigated for public models hosted on the Hugging Face Hub, which are [scanned for malware](https://huggingface.co/docs/hub/security-malware) at each commit. See the [Hub documentation](https://huggingface.co/docs/hub/security) for best practices like [signed commit verification](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) with GPG. | ||||
|  | ||||
| TensorFlow and Flax checkpoints are not affected, and can be loaded within PyTorch architectures using the `from_tf` and `from_flax` kwargs for the `from_pretrained` method to circumvent this issue. | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
| Generally, we recommend using the `AutoTokenizer` class and the `AutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. | ||||
| </pt> | ||||
| <tf> | ||||
| Finally, the `TFAutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`TFAutoModelForSequenceClassification.from_pretrained`]: | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import TFAutoModelForSequenceClassification | ||||
|  | ||||
| >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") | ||||
| ``` | ||||
|  | ||||
| Easily reuse the same checkpoint to load an architecture for a different task: | ||||
|  | ||||
| ```py | ||||
| >>> from transformers import TFAutoModelForTokenClassification | ||||
|  | ||||
| >>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") | ||||
| ``` | ||||
|  | ||||
| Generally, we recommend using the `AutoTokenizer` class and the `TFAutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. | ||||
| </tf> | ||||
| </frameworkcontent> | ||||
| @ -1,155 +0,0 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Backbones | ||||
|  | ||||
| Higher-level computer vision tasks, such as object detection or image segmentation, use several models together to generate a prediction. A separate model is used for the *backbone*, neck, and head. The backbone extracts useful features from an input image into a feature map, the neck combines and processes the feature maps, and the head uses them to make a prediction. | ||||
|  | ||||
| <div class="flex justify-center"> | ||||
|     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Backbone.png"/> | ||||
| </div> | ||||
|  | ||||
| Load a backbone with [`~PretrainedConfig.from_pretrained`] and use the `out_indices` parameter to determine which layer, given by the index, to extract a feature map from. | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoBackbone | ||||
|  | ||||
| model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) | ||||
| ``` | ||||
|  | ||||
| This guide describes the backbone class, backbones from the [timm](https://hf.co/docs/timm/index) library, and how to extract features with them. | ||||
|  | ||||
| ## Backbone classes | ||||
|  | ||||
| There are two backbone classes. | ||||
|  | ||||
| - [`~transformers.utils.BackboneMixin`] allows you to load a backbone and includes functions for extracting the feature maps and indices. | ||||
| - [`~transformers.utils.BackboneConfigMixin`] allows you to set the feature map and indices of a backbone configuration. | ||||
|  | ||||
| Refer to the [Backbone](./main_classes/backbones) API documentation to check which models support a backbone. | ||||
|  | ||||
| There are two ways to load a Transformers backbone, [`AutoBackbone`] and a model-specific backbone class. | ||||
|  | ||||
| <hfoptions id="backbone-classes"> | ||||
| <hfoption id="AutoBackbone"> | ||||
|  | ||||
| The [AutoClass](./model_doc/auto) API automatically loads a pretrained vision model with [`~PretrainedConfig.from_pretrained`] as a backbone if it's supported. | ||||
|  | ||||
| Set the `out_indices` parameter to the layer you'd like to get the feature map from. If you know the name of the layer, you could also use `out_features`. These parameters can be used interchangeably, but if you use both, make sure they refer to the same layer. | ||||
|  | ||||
| When `out_indices` or `out_features` isn't used, the backbone returns the feature map from the last layer. The example code below uses `out_indices=(1,)` to get the feature map from the first layer. | ||||
|  | ||||
| <div class="flex justify-center"> | ||||
|     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stage%201.png"/> | ||||
| </div> | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoImageProcessor, AutoBackbone | ||||
|  | ||||
| model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| <hfoption id="model-specific backbone"> | ||||
|  | ||||
| When you know a model supports a backbone, you can load the backbone and neck directly into the model's configuration. Pass the configuration to the model to initialize it for a task. | ||||
|  | ||||
| The example below loads a [ResNet](./model_doc/resnet) backbone and neck for use in a [MaskFormer](./model_doc/maskformer) instance segmentation head. | ||||
|  | ||||
| Set `backbone` to a pretrained model and `use_pretrained_backbone=True` to use pretrained weights instead of randomly initialized weights. | ||||
|  | ||||
| ```py | ||||
| from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation | ||||
|  | ||||
| config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=True) | ||||
| model = MaskFormerForInstanceSegmentation(config) | ||||
| ``` | ||||
|  | ||||
| Another option is to separately load the backbone configuration and then pass it to `backbone_config` in the model configuration. | ||||
|  | ||||
| ```py | ||||
| from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig | ||||
|  | ||||
| # instantiate backbone configuration | ||||
| backbone_config = ResNetConfig() | ||||
| # load backbone in model | ||||
| config = MaskFormerConfig(backbone_config=backbone_config) | ||||
| # attach backbone to model head | ||||
| model = MaskFormerForInstanceSegmentation(config) | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| </hfoptions> | ||||
|  | ||||
| ## timm backbones | ||||
|  | ||||
| [timm](https://hf.co/docs/timm/index) is a collection of vision models for training and inference. Transformers supports timm models as backbones with the [`TimmBackbone`] and [`TimmBackboneConfig`] classes. | ||||
|  | ||||
| Set `use_timm_backbone=True` to load pretrained timm weights, and `use_pretrained_backbone` to use pretrained or randomly initialized weights. | ||||
|  | ||||
| ```py | ||||
| from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation | ||||
|  | ||||
| config = MaskFormerConfig(backbone="resnet50", use_timm_backbone=True, use_pretrained_backbone=True) | ||||
| model = MaskFormerForInstanceSegmentation(config) | ||||
| ``` | ||||
|  | ||||
| You could also explicitly call the [`TimmBackboneConfig`] class to load and create a pretrained timm backbone. | ||||
|  | ||||
| ```py | ||||
| from transformers import TimmBackboneConfig | ||||
|  | ||||
| backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=True) | ||||
| ``` | ||||
|  | ||||
| Pass the backbone configuration to the model configuration and instantiate the model head, [`MaskFormerForInstanceSegmentation`], with the backbone. | ||||
|  | ||||
| ```py | ||||
| from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation | ||||
|  | ||||
| config = MaskFormerConfig(backbone_config=backbone_config) | ||||
| model = MaskFormerForInstanceSegmentation(config) | ||||
| ``` | ||||
|  | ||||
| ## Feature extraction | ||||
|  | ||||
| The backbone is used to extract image features. Pass an image through the backbone to get the feature maps. | ||||
|  | ||||
| Load and preprocess an image and pass it to the backbone. The example below extracts the feature maps from the first layer. | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoImageProcessor, AutoBackbone | ||||
| import torch | ||||
| from PIL import Image | ||||
| import requests | ||||
|  | ||||
| model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) | ||||
| processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") | ||||
|  | ||||
| url = "http://images.cocodataset.org/val2017/000000039769.jpg" | ||||
| image = Image.open(requests.get(url, stream=True).raw) | ||||
|  | ||||
| inputs = processor(image, return_tensors="pt") | ||||
| outputs = model(**inputs) | ||||
| ``` | ||||
|  | ||||
| The features are stored in and accessed from the output's `feature_maps` attribute. | ||||
|  | ||||
| ```py | ||||
| feature_maps = outputs.feature_maps | ||||
| list(feature_maps[0].shape) | ||||
| [1, 96, 56, 56] | ||||
| ``` | ||||
							
								
								
									
										41
									
								
								docs/source/en/bertology.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								docs/source/en/bertology.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,41 @@ | ||||
| <!--Copyright 2020 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # BERTology | ||||
|  | ||||
| There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT | ||||
| (that some call "BERTology"). Some good examples of this field are: | ||||
|  | ||||
|  | ||||
| - BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: | ||||
|   https://arxiv.org/abs/1905.05950 | ||||
| - Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 | ||||
| - What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. | ||||
|   Manning: https://arxiv.org/abs/1906.04341 | ||||
| - CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633 | ||||
|  | ||||
| In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to | ||||
| help people access the inner representations, mainly adapted from the great work of Paul Michel | ||||
| (https://arxiv.org/abs/1905.10650): | ||||
|  | ||||
|  | ||||
| - accessing all the hidden-states of BERT/GPT/GPT-2, | ||||
| - accessing all the attention weights for each head of BERT/GPT/GPT-2, | ||||
| - retrieving head output values and gradients to be able to compute head importance scores and prune heads as explained | ||||
|   in https://arxiv.org/abs/1905.10650. | ||||
|  | ||||
| To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) which extracts information and prunes a model pre-trained on | ||||
| GLUE. | ||||
							
								
								
									
										215
									
								
								docs/source/en/big_models.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										215
									
								
								docs/source/en/big_models.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,215 @@ | ||||
| <!--Copyright 2022 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Instantiate a big model | ||||
|  | ||||
| A barrier to accessing very large pretrained models is the amount of memory required. When loading a pretrained PyTorch model, you usually: | ||||
|  | ||||
| 1. Create a model with random weights. | ||||
| 2. Load your pretrained weights. | ||||
| 3. Put those pretrained weights in the model. | ||||
|  | ||||
| The first two steps both require a full version of the model in memory and if the model weighs several GBs, you may not have enough memory for two copies of it. This problem is amplified in distributed training environments because each process loads a pretrained model and stores two copies in memory. | ||||
|  | ||||
| > [!TIP] | ||||
| > The randomly created model is initialized with "empty" tensors, which take space in memory without filling it. The random values are whatever was in this chunk of memory at the time. To improve loading speed, the [`_fast_init`](https://github.com/huggingface/transformers/blob/c9f6e5e35156e068b227dd9b15521767f6afd4d2/src/transformers/modeling_utils.py#L2710) parameter is set to `True` by default to skip the random initialization for all weights that are correctly loaded. | ||||
|  | ||||
| This guide will show you how Transformers can help you load large pretrained models despite their memory requirements. | ||||
|  | ||||
| ## Sharded checkpoints | ||||
|  | ||||
| From Transformers v4.18.0, a checkpoint larger than 10GB is automatically sharded by the [`~PreTrainedModel.save_pretrained`] method. The checkpoint is split into several smaller partial checkpoints, and an index file is created that maps parameter names to the files they're stored in. | ||||
|  | ||||
| The maximum shard size is controlled with the `max_shard_size` parameter, but by default it is 5GB, because it is easier to run on free-tier GPU instances without running out of memory. | ||||
|  | ||||
| For example, let's shard [BioMistral/BioMistral-7B](https://hf.co/BioMistral/BioMistral-7B). | ||||
|  | ||||
| ```py | ||||
| >>> with tempfile.TemporaryDirectory() as tmp_dir: | ||||
| ...     model.save_pretrained(tmp_dir, max_shard_size="5GB") | ||||
| ...     print(sorted(os.listdir(tmp_dir))) | ||||
| ['config.json', 'generation_config.json', 'model-00001-of-00006.safetensors', 'model-00002-of-00006.safetensors', 'model-00003-of-00006.safetensors', 'model-00004-of-00006.safetensors', 'model-00005-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model.safetensors.index.json'] | ||||
| ``` | ||||
|  | ||||
| The sharded checkpoint is reloaded with the [`~PreTrainedModel.from_pretrained`] method. | ||||
|  | ||||
| ```py | ||||
| >>> with tempfile.TemporaryDirectory() as tmp_dir: | ||||
| ...     model.save_pretrained(tmp_dir, max_shard_size="5GB") | ||||
| ...     new_model = AutoModel.from_pretrained(tmp_dir) | ||||
| ``` | ||||
|  | ||||
| The main advantage of sharded checkpoints for big models is that each shard is loaded after the previous one, which caps the memory usage to only the model size and the largest shard size. | ||||
|  | ||||
| You could also directly load a sharded checkpoint inside a model without the [`~PreTrainedModel.from_pretrained`] method (similar to PyTorch's `load_state_dict()` method for a full checkpoint). In this case, use the [`~modeling_utils.load_sharded_checkpoint`] method. | ||||
|  | ||||
| ```py | ||||
| >>> from transformers.modeling_utils import load_sharded_checkpoint | ||||
|  | ||||
| >>> with tempfile.TemporaryDirectory() as tmp_dir: | ||||
| ...     model.save_pretrained(tmp_dir, max_shard_size="5GB") | ||||
| ...     load_sharded_checkpoint(model, tmp_dir) | ||||
| ``` | ||||
|  | ||||
| ### Shard metadata | ||||
|  | ||||
| The index file determines which keys are in the checkpoint and where the corresponding weights are stored. This file is loaded like any other JSON file and you can get a dictionary from it. | ||||
|  | ||||
| ```py | ||||
| >>> import json | ||||
|  | ||||
| >>> with tempfile.TemporaryDirectory() as tmp_dir: | ||||
| ...     model.save_pretrained(tmp_dir, max_shard_size="5GB") | ||||
| ...     with open(os.path.join(tmp_dir, "model.safetensors.index.json"), "r") as f: | ||||
| ...         index = json.load(f) | ||||
|  | ||||
| >>> print(index.keys()) | ||||
| dict_keys(['metadata', 'weight_map']) | ||||
| ``` | ||||
|  | ||||
| The `metadata` key provides the total model size. | ||||
|  | ||||
| ```py | ||||
| >>> index["metadata"] | ||||
| {'total_size': 28966928384} | ||||
| ``` | ||||
|  | ||||
| The `weight_map` key maps each parameter name (typically `state_dict` in a PyTorch model) to the shard it's stored in. | ||||
|  | ||||
| ```py | ||||
| >>> index["weight_map"] | ||||
| {'lm_head.weight': 'model-00006-of-00006.safetensors', | ||||
|  'model.embed_tokens.weight': 'model-00001-of-00006.safetensors', | ||||
|  'model.layers.0.input_layernorm.weight': 'model-00001-of-00006.safetensors', | ||||
|  'model.layers.0.mlp.down_proj.weight': 'model-00001-of-00006.safetensors', | ||||
|  ... | ||||
| } | ||||
| ``` | ||||
|  | ||||
| ## Accelerate's Big Model Inference | ||||
|  | ||||
| > [!TIP] | ||||
| > Make sure you have Accelerate v0.9.0 or later and PyTorch v1.9.0 or later installed. | ||||
|  | ||||
| From Transformers v4.20.0, the [`~PreTrainedModel.from_pretrained`] method is supercharged with Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature to efficiently handle really big models! Big Model Inference creates a *model skeleton* on PyTorch's [**meta**](https://pytorch.org/docs/main/meta.html) device. The randomly initialized parameters are only created when the pretrained weights are loaded. This way, you aren't keeping two copies of the model in memory at the same time (one for the randomly initialized model and one for the pretrained weights), and the maximum memory consumed is only the full model size. | ||||
|  | ||||
| To enable Big Model Inference in Transformers, set `low_cpu_mem_usage=True` in the [`~PreTrainedModel.from_pretrained`] method. | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoModelForCausalLM | ||||
|  | ||||
| gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", low_cpu_mem_usage=True) | ||||
| ``` | ||||
|  | ||||
| Accelerate automatically dispatches the model weights across all available devices, starting with the fastest device (GPU) first and then offloading to the slower devices (CPU and even hard drive). This is enabled by setting `device_map="auto"` in the [`~PreTrainedModel.from_pretrained`] method. When you pass the `device_map` parameter, `low_cpu_mem_usage` is automatically set to `True` so you don't need to specify it. | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoModelForCausalLM | ||||
|  | ||||
| # these loading methods are equivalent | ||||
| gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto") | ||||
| gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto", low_cpu_mem_usage=True) | ||||
| ``` | ||||
|  | ||||
| You can also write your own `device_map` by mapping each layer to a device. It should map all model parameters to a device, but you don't have to detail where all the submodules of a layer go if the entire layer is on the same device. | ||||
|  | ||||
| ```python | ||||
| device_map = {"model.layers.1": 0, "model.layers.14": 1, "model.layers.31": "cpu", "lm_head": "disk"} | ||||
| ``` | ||||
|  | ||||
| Access the `hf_device_map` attribute to see how Accelerate split the model across devices. | ||||
|  | ||||
| ```py | ||||
| gemma.hf_device_map | ||||
| ``` | ||||
|  | ||||
| ```python out | ||||
| {'model.embed_tokens': 0, | ||||
|  'model.layers.0': 0, | ||||
|  'model.layers.1': 0, | ||||
|  'model.layers.2': 0, | ||||
|  'model.layers.3': 0, | ||||
|  'model.layers.4': 0, | ||||
|  'model.layers.5': 0, | ||||
|  'model.layers.6': 0, | ||||
|  'model.layers.7': 0, | ||||
|  'model.layers.8': 0, | ||||
|  'model.layers.9': 0, | ||||
|  'model.layers.10': 0, | ||||
|  'model.layers.11': 0, | ||||
|  'model.layers.12': 0, | ||||
|  'model.layers.13': 0, | ||||
|  'model.layers.14': 'cpu', | ||||
|  'model.layers.15': 'cpu', | ||||
|  'model.layers.16': 'cpu', | ||||
|  'model.layers.17': 'cpu', | ||||
|  'model.layers.18': 'cpu', | ||||
|  'model.layers.19': 'cpu', | ||||
|  'model.layers.20': 'cpu', | ||||
|  'model.layers.21': 'cpu', | ||||
|  'model.layers.22': 'cpu', | ||||
|  'model.layers.23': 'cpu', | ||||
|  'model.layers.24': 'cpu', | ||||
|  'model.layers.25': 'cpu', | ||||
|  'model.layers.26': 'cpu', | ||||
|  'model.layers.27': 'cpu', | ||||
|  'model.layers.28': 'cpu', | ||||
|  'model.layers.29': 'cpu', | ||||
|  'model.layers.30': 'cpu', | ||||
|  'model.layers.31': 'cpu', | ||||
|  'model.norm': 'cpu', | ||||
|  'lm_head': 'cpu'} | ||||
| ``` | ||||
|  | ||||
| ## Model data type | ||||
|  | ||||
| PyTorch model weights are normally instantiated as torch.float32 and it can be an issue if you try to load a model as a different data type. For example, you'd need twice as much memory to load the weights in torch.float32 and then again to load them in your desired data type, like torch.float16. | ||||
|  | ||||
| > [!WARNING] | ||||
| > Due to how PyTorch is designed, the `torch_dtype` parameter only supports floating data types. | ||||
|  | ||||
| To avoid wasting memory like this, explicitly set the `torch_dtype` parameter to the desired data type or set `torch_dtype="auto"` to load the weights with the most optimal memory pattern (the data type is automatically derived from the model weights). | ||||
|  | ||||
| <hfoptions id="dtype"> | ||||
| <hfoption id="specific dtype"> | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoModelForCausalLM | ||||
|  | ||||
| gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype=torch.float16) | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| <hfoption id="auto dtype"> | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoModelForCausalLM | ||||
|  | ||||
| gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype="auto") | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| </hfoptions> | ||||
|  | ||||
| You can also set the data type to use for models instantiated from scratch. | ||||
|  | ||||
| ```python | ||||
| import torch | ||||
| from transformers import AutoConfig, AutoModel | ||||
|  | ||||
| my_config = AutoConfig.from_pretrained("google/gemma-2b", torch_dtype=torch.float16) | ||||
| model = AutoModel.from_config(my_config) | ||||
| ``` | ||||
| @ -1,162 +0,0 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Caching | ||||
| Imagine you're having a conversation with someone, and instead of remembering what they previously said, they have to start from scratch every time you respond. This would be slow and inefficient, right? | ||||
|  | ||||
| You can extend this analogy to transformer models. Autoregressive model generation can be slow because it makes a prediction one token at a time. Each new prediction is dependent on all the previous context. | ||||
|  | ||||
| To predict the 1000th token, the model requires information from the previous 999 tokens. The information is represented as matrix multiplications across the token representations. | ||||
|  | ||||
| To predict the 1001st token, you need the same information from the previous 999 tokens in addition to any information from the 1000th token. This is a lot of matrix multiplications a model has to compute over and over for each token! | ||||
|  | ||||
| A key-value (KV) cache eliminates this inefficiency by storing kv pairs derived from the attention layers of previously processed tokens. The stored kv pairs are retrieved from the cache and reused for subsequent tokens, avoiding the need to recompute. | ||||
|  | ||||
| > [!WARNING] | ||||
| > Caching should only be used for **inference**. It may cause unexpected errors if it's enabled during training. | ||||
|  | ||||
| To better understand how and why caching works, let's take a closer look at the structure of the attention matrices. | ||||
|  | ||||
| ## Attention matrices | ||||
|  | ||||
| The **scaled dot-product attention** is calculated as shown below for a batch of size `b`, number of attention heads `h`, sequence length so far `T`, and dimension per attention head `d_head`. | ||||
|  | ||||
| $$ | ||||
| \text{Attention}(Q, K, V) = \text{softmax}\left( \frac{Q K^\top}{\sqrt{d_{\text{head}}}} \times \text{mask} \right) V | ||||
| $$ | ||||
|  | ||||
| The query (`Q`), key (`K`), and value (`V`) matrices are projections from the input embeddings of shape `(b, h, T, d_head)`. | ||||
|  | ||||
| For causal attention, the mask prevents the model from attending to future tokens. Once a token is processed, its representation never changes with respect to future tokens, which means \\( K_{\text{past}} \\) and \\( V_{\text{past}} \\) can be cached and reused to compute the last token's representation. | ||||
|  | ||||
| $$ | ||||
| \text{Attention}(q_t, [\underbrace{k_1, k_2, \dots, k_{t-1}}_{\text{cached}}, k_{t}], [\underbrace{v_1, v_2, \dots, v_{t-1}}_{\text{cached}}, v_{t}]) | ||||
| $$ | ||||
|  | ||||
| At inference time, you only need the last token's query to compute the representation \\( x_t \\) that predicts the next token \\( t+1 \\). At each step, the new key and value vectors are **stored** in the cache and **appended** to the past keys and values. | ||||
|  | ||||
| $$ | ||||
| K_{\text{cache}} \leftarrow \text{concat}(K_{\text{past}}, k_t), \quad V_{\text{cache}} \leftarrow \text{concat}(V_{\text{past}}, v_t) | ||||
| $$ | ||||
|  | ||||
| Attention is calculated independently in each layer of the model, and caching is done on a per-layer basis. | ||||
|  | ||||
| Refer to the table below to compare how caching improves efficiency. | ||||
|  | ||||
| | without caching | with caching | | ||||
| |---|---| | ||||
| | for each step, recompute all previous `K` and `V` | for each step, only compute current `K` and `V` | | ||||
| | attention cost per step is **quadratic** with sequence length | attention cost per step is **linear** with sequence length (memory grows linearly, but compute/token remains low) | | ||||
|  | ||||
|  | ||||
|  | ||||
| ## Cache class | ||||
|  | ||||
| A basic KV cache interface takes a key and value tensor for the current token and returns the updated `K` and `V` tensors. This is internally managed by a model's `forward` method. | ||||
|  | ||||
| ```py | ||||
| new_K, new_V = cache.update(k_t, v_t, layer_idx) | ||||
| attn_output = attn_layer_idx_fn(q_t, new_K, new_V) | ||||
| ``` | ||||
|  | ||||
| When you use Transformers' [`Cache`] class, the self-attention module performs several critical steps to integrate past and present information. | ||||
|  | ||||
| 1. The attention module concatenates current kv pairs with past kv pairs stored in the cache. This creates attention weights with the shape `(new_tokens_length, past_kv_length + new_tokens_length)`. The current and past kv pairs are essentially combined to compute the attention scores, ensuring a model is aware of previous context and the current input. | ||||
|  | ||||
| 2. When the `forward` method is called iteratively, it's crucial that the attention mask shape matches the combined length of the past and current kv pairs. The attention mask should have the shape `(batch_size, past_kv_length + new_tokens_length)`. This is typically handled internally in [`~GenerationMixin.generate`], but if you want to implement your own generation loop with [`Cache`], keep this in mind! The attention mask should hold the past and current token values. | ||||
|  | ||||
| 3. It is also important to be aware of the `cache_position`. This is important if you want to reuse a prefilled [`Cache`] with the `forward` method because you have to pass a valid `cache_position` value. This indicates the input positions in a sequence. `cache_position` is unaffected by padding, and it always adds one more position for each token. For example, if a kv cache contains 10 tokens - regardless of pad tokens - the cache position for the next token should be `torch.tensor([10])`. | ||||
|  | ||||
| ## Cache storage implementation | ||||
|  | ||||
| The actual storage of key-value pairs varies between cache implementations. As an example, consider the [`DynamicCache`]. | ||||
|  | ||||
|  | ||||
| In [`DynamicCache`], the key-value pairs are stored as two lists of tensors. Each tensor in the lists has the shape `[batch_size, num_heads, seq_len, head_dim]`. | ||||
| - `key_cache`: A list of tensors, one for each layer. | ||||
| - `value_cache`: A list of tensors, one for each layer. | ||||
|  | ||||
| When new tokens are processed: | ||||
|  | ||||
| 1. For each layer, the new key and value states are concatenated with the existing cache. | ||||
| ```py | ||||
| self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) | ||||
| self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) | ||||
| ``` | ||||
|  | ||||
| 2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token. | ||||
|  | ||||
| 3. The cache maintains a count of seen tokens through `self._seen_tokens`. This is updated when the first layer processes a new token. | ||||
|  | ||||
| The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token. | ||||
|  | ||||
| ```py | ||||
| import torch | ||||
| from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache | ||||
|  | ||||
| model_id = "meta-llama/Llama-2-7b-chat-hf" | ||||
| model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0") | ||||
| tokenizer = AutoTokenizer.from_pretrained(model_id) | ||||
|  | ||||
| past_key_values = DynamicCache() | ||||
| messages = [{"role": "user", "content": "Hello, what's your name."}] | ||||
| inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0") | ||||
|  | ||||
| generated_ids = inputs.input_ids | ||||
| cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device="cuda:0") | ||||
| max_new_tokens = 10 | ||||
|  | ||||
| for _ in range(max_new_tokens): | ||||
|     outputs = model(**inputs, cache_position=cache_position, past_key_values=past_key_values, use_cache=True) | ||||
|     # Greedily sample one next token | ||||
|     next_token_ids = outputs.logits[:, -1:].argmax(-1) | ||||
|     generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1) | ||||
|     # Prepare inputs for the next generation step by leaving unprocessed tokens, in our case we have only one new token | ||||
|     # and expanding attn mask for the new token, as explained above | ||||
|     attention_mask = inputs["attention_mask"] | ||||
|     attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1) | ||||
|     inputs = {"input_ids": next_token_ids, "attention_mask": attention_mask} | ||||
|     cache_position = cache_position[-1:] + 1 # add one more position for the next token | ||||
|  | ||||
| print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]) | ||||
| "[INST] Hello, what's your name. [/INST]  Hello! My name is LLaMA," | ||||
| ``` | ||||
| ## Legacy cache format | ||||
|  | ||||
| Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`]. | ||||
|  | ||||
| The legacy format is essentially the same data structure but organized differently. | ||||
| - It's a tuple of tuples, where each inner tuple contains the key and value tensors for a layer. | ||||
| - The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`. | ||||
| - The format is less flexible and doesn't support features like quantization or offloading. | ||||
|  | ||||
| If your project depends on this legacy format, you can convert between [`DynamicCache`] and a tuple of tuples as shown below with the [`~DynamicCache.from_legacy_cache`] and [`~DynamicCache.to_legacy_cache`] functions. This is helpful if you have custom logic for manipulating a cache in a specific format. | ||||
|  | ||||
| ```py | ||||
| import torch | ||||
| from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache | ||||
|  | ||||
| tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") | ||||
| model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto") | ||||
| inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) | ||||
|  | ||||
| # `return_dict_in_generate=True` is required to return the cache and `return_legacy_cache` forces the returned cache | ||||
| # in the legacy format | ||||
| generation_outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5) | ||||
|  | ||||
| cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values) | ||||
| legacy_format_cache = cache.to_legacy_cache() | ||||
| ``` | ||||
| @ -1,299 +0,0 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Tools and RAG | ||||
|  | ||||
| The [`~PreTrainedTokenizerBase.apply_chat_template`] method supports virtually any additional argument type - strings, lists, dicts - besides the chat message. This makes it possible to use chat templates for many use cases. | ||||
|  | ||||
| This guide will demonstrate how to use chat templates with tools and retrieval-augmented generation (RAG). | ||||
|  | ||||
| ## Tools | ||||
|  | ||||
| Tools are functions a large language model (LLM) can call to perform specific tasks. It is a powerful way to extend the capabilities of conversational agents with real-time information, computational tools, or access to large databases. | ||||
|  | ||||
| Follow the rules below when creating a tool. | ||||
|  | ||||
| 1. The function should have a descriptive name. | ||||
| 2. The function arguments must have a type hint in the function header (don't include in the `Args` block). | ||||
| 3. The function must have a [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) docstring. | ||||
| 4. The function can have a return type and `Returns` block, but these are optional because most tool use models ignore them. | ||||
|  | ||||
| An example tool to get temperature and wind speed is shown below. | ||||
|  | ||||
| ```py | ||||
| def get_current_temperature(location: str, unit: str) -> float: | ||||
|     """ | ||||
|     Get the current temperature at a location. | ||||
|      | ||||
|     Args: | ||||
|         location: The location to get the temperature for, in the format "City, Country" | ||||
|         unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"]) | ||||
|     Returns: | ||||
|         The current temperature at the specified location in the specified units, as a float. | ||||
|     """ | ||||
|     return 22.  # A real function should probably actually get the temperature! | ||||
|  | ||||
| def get_current_wind_speed(location: str) -> float: | ||||
|     """ | ||||
|     Get the current wind speed in km/h at a given location. | ||||
|      | ||||
|     Args: | ||||
|         location: The location to get the wind speed for, in the format "City, Country" | ||||
|     Returns: | ||||
|         The current wind speed at the given location in km/h, as a float. | ||||
|     """ | ||||
|     return 6.  # A real function should probably actually get the wind speed! | ||||
|  | ||||
| tools = [get_current_temperature, get_current_wind_speed] | ||||
| ``` | ||||
|  | ||||
| Load a model and tokenizer that support tool use, such as [NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B). You can also consider a larger model like [Command-R](./model_doc/cohere) or [Mixtral-8x22B](./model_doc/mixtral) if your hardware can support it. | ||||
|  | ||||
| ```py | ||||
| import torch | ||||
| from transformers import AutoModelForCausalLM, AutoTokenizer | ||||
|  | ||||
| tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B") | ||||
| tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B") | ||||
| model = AutoModelForCausalLM.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B", torch_dtype=torch.bfloat16, device_map="auto") | ||||
| ``` | ||||
|  | ||||
| Create a chat message. | ||||
|  | ||||
| ```py | ||||
| messages = [ | ||||
|   {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."}, | ||||
|   {"role": "user", "content": "Hey, what's the temperature in Paris right now?"} | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| Pass `messages` and a list of tools to [`~PreTrainedTokenizerBase.apply_chat_template`]. Then you can pass the inputs to the model for generation. | ||||
|  | ||||
| ```py | ||||
| inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") | ||||
| inputs = {k: v for k, v in inputs.items()} | ||||
| outputs = model.generate(**inputs, max_new_tokens=128) | ||||
| print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):])) | ||||
| ``` | ||||
|  | ||||
| ```txt | ||||
| <tool_call> | ||||
| {"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"} | ||||
| </tool_call><|im_end|> | ||||
| ``` | ||||
|  | ||||
| The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature.  | ||||
|  | ||||
| Now append the `get_current_temperature` function and these arguments to the chat message as `tool_call`. The `tool_call` dictionary should be provided to the `assistant` role instead of the `system` or `user`. | ||||
|  | ||||
| > [!WARNING] | ||||
| > The OpenAI API uses a JSON string as its `tool_call` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict. | ||||
|  | ||||
| <hfoptions id="tool-call"> | ||||
| <hfoption id="Llama"> | ||||
|  | ||||
| ```py | ||||
| tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} | ||||
| messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]}) | ||||
| ``` | ||||
|  | ||||
| Allow the assistant to read the function outputs and chat with the user. | ||||
|  | ||||
| ```py | ||||
| inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") | ||||
| inputs = {k: v for k, v in inputs.items()} | ||||
| out = model.generate(**inputs, max_new_tokens=128) | ||||
| print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) | ||||
| ``` | ||||
|  | ||||
| ```txt | ||||
| The temperature in Paris, France right now is approximately 12°C (53.6°F).<|im_end|> | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| <hfoption id="Mistral/Mixtral"> | ||||
|  | ||||
| For [Mistral](./model_doc/mistral) and [Mixtral](./model_doc/mixtral) models, you need an additional `tool_call_id`. The `tool_call_id` is 9 randomly generated alphanumeric characters assigned to the `id` key in the `tool_call` dictionary. | ||||
|  | ||||
| ```py | ||||
| tool_call_id = "9Ae3bDc2F" | ||||
| tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} | ||||
| messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]}) | ||||
| ``` | ||||
|  | ||||
| ```py | ||||
| inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") | ||||
| inputs = {k: v for k, v in inputs.items()} | ||||
| out = model.generate(**inputs, max_new_tokens=128) | ||||
| print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| </hfoptions> | ||||
|  | ||||
| ## Schema | ||||
|  | ||||
| [`~PreTrainedTokenizerBase.apply_chat_template`] converts functions into a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step) which is passed to the chat template. An LLM never sees the code inside the function. In other words, an LLM doesn't care how the function works technically; it only cares about the function **definition** and **arguments**. | ||||
|  | ||||
| The JSON schema is automatically generated behind the scenes as long as your function follows the [rules](#tools) listed above. But you can use [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205) to manually convert a function to a schema for more visibility or debugging. | ||||
|  | ||||
| ```py | ||||
| from transformers.utils import get_json_schema | ||||
|  | ||||
| def multiply(a: float, b: float): | ||||
|     """ | ||||
|     A function that multiplies two numbers | ||||
|      | ||||
|     Args: | ||||
|         a: The first number to multiply | ||||
|         b: The second number to multiply | ||||
|     """ | ||||
|     return a * b | ||||
|  | ||||
| schema = get_json_schema(multiply) | ||||
| print(schema) | ||||
| ``` | ||||
|  | ||||
| ```json | ||||
| { | ||||
|   "type": "function",  | ||||
|   "function": { | ||||
|     "name": "multiply",  | ||||
|     "description": "A function that multiplies two numbers",  | ||||
|     "parameters": { | ||||
|       "type": "object",  | ||||
|       "properties": { | ||||
|         "a": { | ||||
|           "type": "number",  | ||||
|           "description": "The first number to multiply" | ||||
|         },  | ||||
|         "b": { | ||||
|           "type": "number", | ||||
|           "description": "The second number to multiply" | ||||
|         } | ||||
|       },  | ||||
|       "required": ["a", "b"] | ||||
|     } | ||||
|   } | ||||
| } | ||||
| ``` | ||||
|  | ||||
| You can edit the schema or write one entirely from scratch. This gives you a lot of flexibility to define precise schemas for more complex functions. | ||||
|  | ||||
| > [!WARNING] | ||||
| > Try keeping your function signatures simple and the arguments to a minimum. These are easier for a model to understand and use than complex functions, for example ones with nested arguments. | ||||
|  | ||||
| The example below demonstrates writing a schema manually and then passing it to [`~PreTrainedTokenizerBase.apply_chat_template`]. | ||||
|  | ||||
| ```py | ||||
| # A simple function that takes no arguments | ||||
| current_time = { | ||||
|   "type": "function",  | ||||
|   "function": { | ||||
|     "name": "current_time", | ||||
|     "description": "Get the current local time as a string.", | ||||
|     "parameters": { | ||||
|       'type': 'object', | ||||
|       'properties': {} | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| # A more complete function that takes two numerical arguments | ||||
| multiply = { | ||||
|   'type': 'function', | ||||
|   'function': { | ||||
|     'name': 'multiply', | ||||
|     'description': 'A function that multiplies two numbers',  | ||||
|     'parameters': { | ||||
|       'type': 'object',  | ||||
|       'properties': { | ||||
|         'a': { | ||||
|           'type': 'number', | ||||
|           'description': 'The first number to multiply' | ||||
|         },  | ||||
|         'b': { | ||||
|           'type': 'number', 'description': 'The second number to multiply' | ||||
|         } | ||||
|       },  | ||||
|       'required': ['a', 'b'] | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| model_input = tokenizer.apply_chat_template( | ||||
|     messages, | ||||
|     tools = [current_time, multiply] | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| ## RAG | ||||
|  | ||||
| Retrieval-augmented generation (RAG) models enhance a model's existing knowledge by allowing it to search documents for additional information before answering a query. For RAG models, add a `documents` parameter to [`~PreTrainedTokenizerBase.apply_chat_template`]. This `documents` parameter should be a list of documents, and each document should be a single dict with `title` and `text` keys. | ||||
|  | ||||
| > [!TIP] | ||||
| > The `documents` parameter for RAG isn't widely supported and many models have chat templates that ignore `documents`. Verify if a model supports `documents` by reading its model card or executing `print(tokenizer.chat_template)` to see if the `documents` key is present. [Command-R](https://hf.co/CohereForAI/c4ai-command-r-08-2024) and [Command-R+](https://hf.co/CohereForAI/c4ai-command-r-plus-08-2024) both support `documents` in their RAG chat templates. | ||||
|  | ||||
| Create a list of documents to pass to the model. | ||||
|  | ||||
| ```py | ||||
| documents = [ | ||||
|     { | ||||
|         "title": "The Moon: Our Age-Old Foe",  | ||||
|         "text": "Man has always dreamed of destroying the moon. In this essay, I shall..." | ||||
|     }, | ||||
|     { | ||||
|         "title": "The Sun: Our Age-Old Friend", | ||||
|         "text": "Although often underappreciated, the sun provides several notable benefits..." | ||||
|     } | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| Set `chat_template="rag"` in [`~PreTrainedTokenizerBase.apply_chat_template`] and generate a response. | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoTokenizer, AutoModelForCausalLM | ||||
|  | ||||
| # Load the model and tokenizer | ||||
| tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit") | ||||
| model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit", device_map="auto") | ||||
| device = model.device # Get the device the model is loaded on | ||||
|  | ||||
| # Define conversation input | ||||
| conversation = [ | ||||
|     {"role": "user", "content": "What has Man always dreamed of?"} | ||||
| ] | ||||
|  | ||||
| input_ids = tokenizer.apply_chat_template( | ||||
|     conversation=conversation, | ||||
|     documents=documents, | ||||
|     chat_template="rag", | ||||
|     tokenize=True, | ||||
|     add_generation_prompt=True, | ||||
|     return_tensors="pt").to(device) | ||||
|  | ||||
| # Generate a response  | ||||
| generated_tokens = model.generate( | ||||
|     input_ids, | ||||
|     max_new_tokens=100, | ||||
|     do_sample=True, | ||||
|     temperature=0.3, | ||||
|     ) | ||||
|  | ||||
| # Decode and print the generated text along with generation prompt | ||||
| generated_text = tokenizer.decode(generated_tokens[0]) | ||||
| print(generated_text) | ||||
| ``` | ||||
							
								
								
									
										463
									
								
								docs/source/en/chat_template_advanced.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										463
									
								
								docs/source/en/chat_template_advanced.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,463 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Advanced Usage and Customizing Your Chat Templates | ||||
|  | ||||
| In this page, we’ll explore more advanced techniques for working with chat templates in Transformers. Whether you’re looking to write your own templates, create custom components, or optimize your templates for efficiency, we’ll cover everything you need to take your templates to the next level. Let’s dive into the tools and strategies that will help you get the most out of your chat models. | ||||
|  | ||||
|  | ||||
| ## How do chat templates work? | ||||
|  | ||||
| The chat template for a model is stored on the `tokenizer.chat_template` attribute. Let's take a look at a `Zephyr` chat template, though note this | ||||
| one is a little simplified from the actual one! | ||||
|  | ||||
| ``` | ||||
| {%- for message in messages %} | ||||
|     {{- '<|' + message['role'] + '|>\n' }} | ||||
|     {{- message['content'] + eos_token }} | ||||
| {%- endfor %} | ||||
| {%- if add_generation_prompt %} | ||||
|     {{- '<|assistant|>\n' }} | ||||
| {%- endif %} | ||||
| ``` | ||||
|  | ||||
| If you've never seen one of these before, this is a [Jinja template](https://jinja.palletsprojects.com/en/3.1.x/templates/). | ||||
| Jinja is a templating language that allows you to write simple code that generates text. In many ways, the code and | ||||
| syntax resemble Python. In pure Python, this template would look something like this: | ||||
|  | ||||
| ```python | ||||
| for message in messages: | ||||
|     print(f'<|{message["role"]}|>') | ||||
|     print(message['content'] + eos_token) | ||||
| if add_generation_prompt: | ||||
|     print('<|assistant|>') | ||||
| ``` | ||||
|  | ||||
| Effectively, the template does three things: | ||||
| 1. For each message, print the role enclosed in `<|` and `|>`, like `<|user|>` or `<|assistant|>`. | ||||
| 2. Next, print the content of the message, followed by the end-of-sequence token. | ||||
| 3. Finally, if `add_generation_prompt` is set, print the assistant token, so that the model knows to start generating | ||||
|    an assistant response. | ||||
|  | ||||
| This is a pretty simple template but Jinja gives you a lot of flexibility to do more complex things! Let's see a Jinja | ||||
| template that can format inputs similarly to the way LLaMA formats them (note that the real LLaMA template includes  | ||||
| handling for default system messages and slightly different system message handling in general - don't use this one  | ||||
| in your actual code!) | ||||
|  | ||||
| ``` | ||||
| {%- for message in messages %} | ||||
|     {%- if message['role'] == 'user' %} | ||||
|         {{- bos_token + '[INST] ' + message['content'] + ' [/INST]' }} | ||||
|     {%- elif message['role'] == 'system' %} | ||||
|         {{- '<<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }} | ||||
|     {%- elif message['role'] == 'assistant' %} | ||||
|         {{- ' '  + message['content'] + ' ' + eos_token }} | ||||
|     {%- endif %} | ||||
| {%- endfor %} | ||||
| ``` | ||||
|  | ||||
| Hopefully if you stare at this for a little bit you can see what this template is doing - it adds specific tokens like | ||||
| `[INST]` and `[/INST]` based on the role of each message. User, assistant and system messages are clearly | ||||
| distinguishable to the model because of the tokens they're wrapped in. | ||||
|  | ||||
|  | ||||
| ## How do I create a chat template? | ||||
|  | ||||
| Simple, just write a Jinja template and set `tokenizer.chat_template`. You may find it easier to start with an  | ||||
| existing template from another model and simply edit it for your needs! For example, we could take the LLaMA template | ||||
| above and add "[ASST]" and "[/ASST]" to assistant messages: | ||||
|  | ||||
| ``` | ||||
| {%- for message in messages %} | ||||
|     {%- if message['role'] == 'user' %} | ||||
|         {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} | ||||
|     {%- elif message['role'] == 'system' %} | ||||
|         {{- '<<SYS>>\\n' + message['content'].strip() + '\\n<</SYS>>\\n\\n' }} | ||||
|     {%- elif message['role'] == 'assistant' %} | ||||
|         {{- '[ASST] '  + message['content'] + ' [/ASST]' + eos_token }} | ||||
|     {%- endif %} | ||||
| {%- endfor %} | ||||
| ``` | ||||
|  | ||||
| Now, simply set the `tokenizer.chat_template` attribute. Next time you use [`~PreTrainedTokenizer.apply_chat_template`], it will | ||||
| use your new template! This attribute will be saved in the `tokenizer_config.json` file, so you can use | ||||
| [`~utils.PushToHubMixin.push_to_hub`] to upload your new template to the Hub and make sure everyone's using the right | ||||
| template for your model! | ||||
|  | ||||
| ```python | ||||
| template = tokenizer.chat_template | ||||
| template = template.replace("SYS", "SYSTEM")  # Change the system token | ||||
| tokenizer.chat_template = template  # Set the new template | ||||
| tokenizer.push_to_hub("model_name")  # Upload your new template to the Hub! | ||||
| ``` | ||||
|  | ||||
| The method [`~PreTrainedTokenizer.apply_chat_template`] which uses your chat template is called by the [`TextGenerationPipeline`] class, so  | ||||
| once you set the correct chat template, your model will automatically become compatible with [`TextGenerationPipeline`]. | ||||
|  | ||||
| <Tip> | ||||
| If you're fine-tuning a model for chat, in addition to setting a chat template, you should probably add any new chat | ||||
| control tokens as special tokens in the tokenizer. Special tokens are never split,  | ||||
| ensuring that your control tokens are always handled as single tokens rather than being tokenized in pieces. You  | ||||
| should also set the tokenizer's `eos_token` attribute to the token that marks the end of assistant generations in your | ||||
| template. This will ensure that text generation tools can correctly figure out when to stop generating text. | ||||
| </Tip> | ||||
|  | ||||
|  | ||||
| ## Why do some models have multiple templates? | ||||
|  | ||||
| Some models use different templates for different use cases. For example, they might use one template for normal chat | ||||
| and another for tool-use, or retrieval-augmented generation. In these cases, `tokenizer.chat_template` is a dictionary. | ||||
| This can cause some confusion, and where possible, we recommend using a single template for all use-cases. You can use | ||||
| Jinja statements like `if tools is defined` and `{% macro %}` definitions to easily wrap multiple code paths in a | ||||
| single template. | ||||
|  | ||||
| When a tokenizer has multiple templates, `tokenizer.chat_template` will be a `dict`, where each key is the name | ||||
| of a template. The `apply_chat_template` method has special handling for certain template names: Specifically, it will | ||||
| look for a template named `default` in most cases, and will raise an error if it can't find one. However, if a template | ||||
| named `tool_use` exists when the user has passed a `tools` argument, it will use that instead. To access templates | ||||
| with other names, pass the name of the template you want to the `chat_template` argument of | ||||
| `apply_chat_template()`. | ||||
|  | ||||
| We find that this can be a bit confusing for users, though - so if you're writing a template yourself, we recommend | ||||
| trying to put it all in a single template where possible! | ||||
|  | ||||
|  | ||||
| ## What template should I use? | ||||
|  | ||||
| When setting the template for a model that's already been trained for chat, you should ensure that the template | ||||
| exactly matches the message formatting that the model saw during training, or else you will probably experience | ||||
| performance degradation. This is true even if you're training the model further - you will probably get the best  | ||||
| performance if you keep the chat tokens constant. This is very analogous to tokenization - you generally get the | ||||
| best performance for inference or fine-tuning when you precisely match the tokenization used during training. | ||||
|  | ||||
| If you're training a model from scratch, or fine-tuning a base language model for chat, on the other hand, | ||||
| you have a lot of freedom to choose an appropriate template! LLMs are smart enough to learn to handle lots of different | ||||
| input formats. One popular choice is the `ChatML` format, and this is a good, flexible choice for many use-cases.  | ||||
| It looks like this: | ||||
|  | ||||
| ``` | ||||
| {%- for message in messages %} | ||||
|     {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} | ||||
| {%- endfor %} | ||||
| ``` | ||||
|  | ||||
| If you like this one, here it is in one-liner form, ready to copy into your code. The one-liner also includes | ||||
| handy support for [generation prompts](#what-are-generation-prompts), but note that it doesn't add BOS or EOS tokens! | ||||
| If your model expects those, they won't be added automatically by `apply_chat_template` - in other words, the | ||||
| text will be tokenized with `add_special_tokens=False`. This is to avoid potential conflicts between the template and | ||||
| the `add_special_tokens` logic. If your model expects special tokens, make sure to add them to the template! | ||||
|  | ||||
| ```python | ||||
| tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" | ||||
| ``` | ||||
|  | ||||
| This template wraps each message in `<|im_start|>` and `<|im_end|>` tokens, and simply writes the role as a string, which | ||||
| allows for flexibility in the roles you train with. The output looks like this: | ||||
|  | ||||
| ```text | ||||
| <|im_start|>system | ||||
| You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|> | ||||
| <|im_start|>user | ||||
| How are you?<|im_end|> | ||||
| <|im_start|>assistant | ||||
| I'm doing great!<|im_end|> | ||||
| ``` | ||||
|  | ||||
| The "user", "system" and "assistant" roles are the standard for chat, and we recommend using them when it makes sense, | ||||
| particularly if you want your model to operate well with [`TextGenerationPipeline`]. However, you are not limited | ||||
| to these roles - templating is extremely flexible, and any string can be a role. | ||||
|  | ||||
| ## I want to add some chat templates! How should I get started? | ||||
|  | ||||
| If you have any chat models, you should set their `tokenizer.chat_template` attribute and test it using | ||||
| [`~PreTrainedTokenizer.apply_chat_template`], then push the updated tokenizer to the Hub. This applies even if you're | ||||
| not the model owner - if you're using a model with an empty chat template, or one that's still using the default class | ||||
| template, please open a [pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) to the model repository so that this attribute can be set properly! | ||||
|  | ||||
| Once the attribute is set, that's it, you're done! `tokenizer.apply_chat_template` will now work correctly for that | ||||
| model, which means it is also automatically supported in places like `TextGenerationPipeline`! | ||||
|  | ||||
| By ensuring that models have this attribute, we can make sure that the whole community gets to use the full power of | ||||
| open-source models. Formatting mismatches have been haunting the field and silently harming performance for too long -  | ||||
| it's time to put an end to them! | ||||
|  | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| The easiest way to get started with writing Jinja templates is to take a look at some existing ones. You can use | ||||
| `print(tokenizer.chat_template)` for any chat model to see what template it's using. In general, models that support tool use have  | ||||
| much more complex templates than other models - so when you're just getting started, they're probably a bad example | ||||
| to learn from! You can also take a look at the  | ||||
| [Jinja documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for details | ||||
| of general Jinja formatting and syntax. | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
| Jinja templates in `transformers` are identical to Jinja templates elsewhere. The main thing to know is that  | ||||
| the conversation history will be accessible inside your template as a variable called `messages`.   | ||||
| You will be able to access `messages` in your template just like you can in Python, which means you can loop over  | ||||
| it with `{% for message in messages %}` or access individual messages with `{{ messages[0] }}`, for example. | ||||
|  | ||||
| You can also use the following tips to write clean, efficient Jinja templates: | ||||
|  | ||||
| ### Trimming whitespace | ||||
|  | ||||
| By default, Jinja will print any whitespace that comes before or after a block. This can be a problem for chat | ||||
| templates, which generally want to be very precise with whitespace! To avoid this, we strongly recommend writing | ||||
| your templates like this: | ||||
|  | ||||
| ``` | ||||
| {%- for message in messages %} | ||||
|     {{- message['role'] + message['content'] }} | ||||
| {%- endfor %} | ||||
| ``` | ||||
|  | ||||
| rather than like this: | ||||
|  | ||||
| ``` | ||||
| {% for message in messages %} | ||||
|     {{ message['role'] + message['content'] }} | ||||
| {% endfor %} | ||||
| ``` | ||||
|  | ||||
| Adding `-` will strip any whitespace that comes before the block. The second example looks innocent, but the newline | ||||
| and indentation may end up being included in the output, which is probably not what you want! | ||||
|  | ||||
| ### Special variables | ||||
|  | ||||
| Inside your template, you will have access to several special variables. The most important of these is `messages`,  | ||||
| which contains the chat history as a list of message dicts. However, there are several others. Not every | ||||
| variable will be used in every template. The most common other variables are: | ||||
|  | ||||
| - `tools` contains a list of tools in JSON schema format. Will be `None` or undefined if no tools are passed. | ||||
| - `documents` contains a list of documents in the format `{"title": "Title", "contents": "Contents"}`, used for retrieval-augmented generation. Will be `None` or undefined if no documents are passed. | ||||
| - `add_generation_prompt` is a bool that is `True` if the user has requested a generation prompt, and `False` otherwise. If this is set, your template should add the header for an assistant message to the end of the conversation. If your model doesn't have a specific header for assistant messages, you can ignore this flag. | ||||
| - **Special tokens** like `bos_token` and `eos_token`. These are extracted from `tokenizer.special_tokens_map`. The exact tokens available inside each template will differ depending on the parent tokenizer. | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| You can actually pass any `kwarg` to `apply_chat_template`, and it will be accessible inside the template as a variable. In general, | ||||
| we recommend trying to stick to the core variables above, as it will make your model harder to use if users have | ||||
| to write custom code to pass model-specific `kwargs`. However, we're aware that this field moves quickly, so if you | ||||
| have a new use-case that doesn't fit in the core API, feel free to use a new `kwarg` for it! If a new `kwarg` | ||||
| becomes common we may promote it into the core API and create a standard, documented format for it. | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
| ### Callable functions | ||||
|  | ||||
| There is also a short list of callable functions available to you inside your templates. These are: | ||||
|  | ||||
| - `raise_exception(msg)`: Raises a `TemplateException`. This is useful for debugging, and for telling users when they're | ||||
| doing something that your template doesn't support. | ||||
| - `strftime_now(format_str)`: Equivalent to `datetime.now().strftime(format_str)` in Python. This is used for getting | ||||
| the current date/time in a specific format, which is sometimes included in system messages. | ||||
|  | ||||
| ### Compatibility with non-Python Jinja | ||||
|  | ||||
| There are multiple implementations of Jinja in various languages. They generally have the same syntax, | ||||
| but a key difference is that when you're writing a template in Python you can use Python methods, such as | ||||
| `.lower()` on strings or `.items()` on dicts. This will break if someone tries to use your template on a non-Python | ||||
| implementation of Jinja. Non-Python implementations are particularly common in deployment environments, where JS | ||||
| and Rust are very popular.  | ||||
|  | ||||
| Don't panic, though! There are a few easy changes you can make to your templates to ensure they're compatible across | ||||
| all implementations of Jinja: | ||||
|  | ||||
| - Replace Python methods with Jinja filters. These usually have the same name, for example `string.lower()` becomes | ||||
|   `string|lower`, and `dict.items()` becomes `dict|items`. One notable change is that `string.strip()` becomes `string|trim`. | ||||
|   See the [list of built-in filters](https://jinja.palletsprojects.com/en/3.1.x/templates/#builtin-filters) | ||||
|   in the Jinja documentation for more. | ||||
| - Replace `True`, `False` and `None`, which are Python-specific, with `true`, `false` and `none`. | ||||
| - Directly rendering a dict or list may give different results in other implementations (for example, string entries | ||||
|   might change from single-quoted to double-quoted). Adding the `tojson` filter can help to ensure consistency here. | ||||
|  | ||||
| ### Writing generation prompts | ||||
|  | ||||
| We mentioned above that `add_generation_prompt` is a special variable that will be accessible inside your template, | ||||
| and is controlled by the user setting the `add_generation_prompt` flag. If your model expects a header for | ||||
| assistant messages, then your template must support adding the header when `add_generation_prompt` is set. | ||||
|  | ||||
| Here is an example of a template that formats messages ChatML-style, with generation prompt support: | ||||
|  | ||||
| ```text | ||||
| {{- bos_token }} | ||||
| {%- for message in messages %} | ||||
|     {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} | ||||
| {%- endfor %} | ||||
| {%- if add_generation_prompt %} | ||||
|     {{- '<|im_start|>assistant\n' }} | ||||
| {%- endif %} | ||||
| ``` | ||||
|  | ||||
| The exact content of the assistant header will depend on your specific model, but it should always be **the string | ||||
| that represents the start of an assistant message**, so that if the user applies your template with  | ||||
| `add_generation_prompt=True` and then generates text, the model will write an assistant response. Also note that some | ||||
| models do not need a generation prompt, because assistant messages always begin immediately after user messages.  | ||||
| This is particularly common for LLaMA and Mistral models, where assistant messages begin immediately after the `[/INST]` | ||||
| token that ends user messages. In these cases, the template can ignore the `add_generation_prompt` flag. | ||||
|  | ||||
| Generation prompts are important! If your model requires a generation prompt but it is not set in the template, then | ||||
| model generations will likely be severely degraded, or the model may display unusual behaviour like continuing  | ||||
| the final user message!  | ||||
|  | ||||
| ### Writing and debugging larger templates | ||||
|  | ||||
| When this feature was introduced, most templates were quite small, the Jinja equivalent of a "one-liner" script.  | ||||
| However, with new models and features like tool-use and RAG, some templates can be 100 lines long or more. When | ||||
| writing templates like these, it's a good idea to write them in a separate file, using a text editor. You can easily  | ||||
| extract a chat template to a file: | ||||
|  | ||||
| ```python | ||||
| open("template.jinja", "w").write(tokenizer.chat_template) | ||||
| ``` | ||||
|  | ||||
| Or load the edited template back into the tokenizer: | ||||
|  | ||||
| ```python | ||||
| tokenizer.chat_template = open("template.jinja").read() | ||||
| ``` | ||||
|  | ||||
| As an added bonus, when you write a long, multi-line template in a separate file, line numbers in that file will | ||||
| exactly correspond to line numbers in template parsing or execution errors. This will make it much easier to | ||||
| identify the source of issues. | ||||
|  | ||||
|  | ||||
|  | ||||
| ## Writing templates for tools | ||||
|  | ||||
| Although chat templates do not enforce a specific API for tools (or for anything, really), we recommend  | ||||
| template authors try to stick to a standard API where possible. The whole point of chat templates is to allow code | ||||
| to be transferable across models, so deviating from the standard tools API means users will have to write | ||||
| custom code to use tools with your model. Sometimes it's unavoidable, but often with clever templating you can | ||||
| make the standard API work! | ||||
|  | ||||
| Below, we'll list the elements of the standard API, and give tips on writing templates that will work well with it. | ||||
|  | ||||
| ### Tool definitions | ||||
|  | ||||
| Your template should expect that the variable `tools` will either be null (if no tools are passed), or a list  | ||||
| of JSON schema dicts. Our chat template methods allow users to pass tools as either JSON schema or Python functions, but when | ||||
| functions are passed, we automatically generate JSON schema and pass that to your template. As a result, the  | ||||
| `tools` variable that your template receives will always be a list of JSON schema. Here is | ||||
| a sample tool JSON schema: | ||||
|  | ||||
| ```json | ||||
| { | ||||
|   "type": "function",  | ||||
|   "function": { | ||||
|     "name": "multiply",  | ||||
|     "description": "A function that multiplies two numbers",  | ||||
|     "parameters": { | ||||
|       "type": "object",  | ||||
|       "properties": { | ||||
|         "a": { | ||||
|           "type": "number",  | ||||
|           "description": "The first number to multiply" | ||||
|         },  | ||||
|         "b": { | ||||
|           "type": "number", | ||||
|           "description": "The second number to multiply" | ||||
|         } | ||||
|       },  | ||||
|       "required": ["a", "b"] | ||||
|     } | ||||
|   } | ||||
| } | ||||
| ``` | ||||
|  | ||||
| And here is some example code for handling tools in your chat template. Remember, this is just an example for a | ||||
| specific format - your model will probably need different formatting! | ||||
|  | ||||
| ```text | ||||
| {%- if tools %} | ||||
|     {%- for tool in tools %} | ||||
|         {{- '<tool>' + tool['function']['name'] + '\n' }} | ||||
|         {%- for argument in tool['function']['parameters']['properties'] %} | ||||
|             {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }} | ||||
|         {%- endfor %} | ||||
|         {{- '\n</tool>' }} | ||||
|     {%- endfor %} | ||||
| {%- endif %} | ||||
| ``` | ||||
|  | ||||
| The specific tokens and tool descriptions your template renders should of course be chosen to match the ones your model | ||||
| was trained with. There is no requirement that your **model** understands JSON schema input, only that your template can translate | ||||
| JSON schema into your model's format. For example, [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024)  | ||||
| was trained with tools defined using Python function headers, but the Command-R tool template accepts JSON schema,  | ||||
| converts types internally and renders the input tools as Python headers. You can do a lot with templates! | ||||
|  | ||||
| ### Tool calls | ||||
|  | ||||
| Tool calls, if present, will be a list attached to a message with the "assistant" role. Note that `tool_calls` is  | ||||
| always a list, even though most tool-calling models only support single tool calls at a time, which means | ||||
| the list will usually only have a single element. Here is a sample message dict containing a tool call: | ||||
|  | ||||
| ```json | ||||
| { | ||||
|   "role": "assistant", | ||||
|   "tool_calls": [ | ||||
|     { | ||||
|       "type": "function", | ||||
|       "function": { | ||||
|         "name": "multiply", | ||||
|         "arguments": { | ||||
|           "a": 5, | ||||
|           "b": 6 | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|   ] | ||||
| } | ||||
| ``` | ||||
|  | ||||
| And a common pattern for handling them would be something like this: | ||||
|  | ||||
| ```text | ||||
| {%- if message['role'] == 'assistant' and 'tool_calls' in message %} | ||||
|     {%- for tool_call in message['tool_calls'] %} | ||||
|         {{- '<tool_call>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n</tool_call>' }} | ||||
|     {%- endfor %} | ||||
| {%- endif %} | ||||
| ``` | ||||
|  | ||||
| Again, you should render the tool call with the formatting and special tokens that your model expects. | ||||
|  | ||||
| ### Tool responses | ||||
|  | ||||
| Tool responses have a simple format: They are a message dict with the "tool" role, a "name" key giving the name | ||||
| of the called function, and a "content" key containing the result of the tool call. Here is a sample tool response: | ||||
|  | ||||
| ```json | ||||
| { | ||||
|   "role": "tool", | ||||
|   "name": "multiply", | ||||
|   "content": "30" | ||||
| } | ||||
| ``` | ||||
|  | ||||
| You don't need to use all of the keys in the tool response. For example, if your model doesn't expect the function | ||||
| name to be included in the tool response, then rendering it can be as simple as: | ||||
|  | ||||
| ```text | ||||
| {%- if message['role'] == 'tool' %} | ||||
|     {{- "<tool_result>" + message['content'] + "</tool_result>" }} | ||||
| {%- endif %} | ||||
| ``` | ||||
|  | ||||
| Again, remember that the actual formatting and special tokens are model-specific - you should take a lot of care | ||||
| to ensure that tokens, whitespace and everything else exactly match the format your model was trained with! | ||||
							
								
								
									
										287
									
								
								docs/source/en/chat_template_basics.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										287
									
								
								docs/source/en/chat_template_basics.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,287 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Getting Started with Chat Templates for Text LLMs | ||||
|  | ||||
| An increasingly common use case for LLMs is **chat**. In a chat context, rather than continuing a single string | ||||
| of text (as is the case with a standard language model), the model instead continues a conversation that consists | ||||
| of one or more **messages**, each of which includes a **role**, like "user" or "assistant", as well as message text. | ||||
|  | ||||
| Much like tokenization, different models expect very different input formats for chat. This is the reason we added | ||||
| **chat templates** as a feature. Chat templates are part of the tokenizer for text-only LLMs or processor for multimodal LLMs. They specify how to convert conversations, represented as lists of messages, into a single tokenizable string in the format that the model expects.  | ||||
|  | ||||
| We'll explore the basic usage of chat templates with text-only LLMs in this page. For detailed guidance on multimodal models, we have a dedicated [documentation page for multimodal models](./chat_template_multimodal), which covers how to work with image, video and audio inputs in your templates. | ||||
|  | ||||
| Let's make this concrete with a quick example using the `mistralai/Mistral-7B-Instruct-v0.1` model: | ||||
|  | ||||
| ```python | ||||
| >>> from transformers import AutoTokenizer | ||||
| >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1") | ||||
|  | ||||
| >>> chat = [ | ||||
| ...   {"role": "user", "content": "Hello, how are you?"}, | ||||
| ...   {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, | ||||
| ...   {"role": "user", "content": "I'd like to show off how chat templating works!"}, | ||||
| ... ] | ||||
|  | ||||
| >>> tokenizer.apply_chat_template(chat, tokenize=False) | ||||
| "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]" | ||||
| ``` | ||||
|  | ||||
| Notice how the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of  | ||||
| user messages (but not assistant messages!), and the entire chat is condensed into a single string.  | ||||
| If we use `tokenize=True`, which is the default setting, that string will also be tokenized for us. | ||||
|  | ||||
| Now, try the same code, but swap in the `HuggingFaceH4/zephyr-7b-beta` model instead, and you should get: | ||||
|  | ||||
| ```text | ||||
| <|user|> | ||||
| Hello, how are you?</s> | ||||
| <|assistant|> | ||||
| I'm doing great. How can I help you today?</s> | ||||
| <|user|> | ||||
| I'd like to show off how chat templating works!</s> | ||||
| ``` | ||||
|  | ||||
| Both Zephyr and Mistral-Instruct were fine-tuned from the same base model, `Mistral-7B-v0.1`. However, they were trained | ||||
| with totally different chat formats. Without chat templates, you would have to write manual formatting code for each | ||||
| model, and it's very easy to make minor errors that hurt performance! Chat templates handle the details of formatting  | ||||
| for you, allowing you to write universal code that works for any model. | ||||
|  | ||||
|  | ||||
| ## How do I use chat templates? | ||||
|  | ||||
| As you can see in the example above, chat templates are easy to use. Simply build a list of messages, with `role` | ||||
| and `content` keys, and then pass it to the [`~PreTrainedTokenizer.apply_chat_template`] or [`~ProcessorMixin.apply_chat_template`] method | ||||
| depending on what type of model you are using. Once you do that, | ||||
| you'll get output that's ready to go! When using chat templates as input for model generation, it's also a good idea | ||||
| to use `add_generation_prompt=True` to add a [generation prompt](#what-are-generation-prompts).  | ||||
|  | ||||
| Here's an example of preparing input for `model.generate()`, using `Zephyr` again: | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoModelForCausalLM, AutoTokenizer | ||||
|  | ||||
| checkpoint = "HuggingFaceH4/zephyr-7b-beta" | ||||
| tokenizer = AutoTokenizer.from_pretrained(checkpoint) | ||||
| model = AutoModelForCausalLM.from_pretrained(checkpoint)  # You may want to use bfloat16 and/or move to GPU here | ||||
|  | ||||
| messages = [ | ||||
|     { | ||||
|         "role": "system", | ||||
|         "content": "You are a friendly chatbot who always responds in the style of a pirate", | ||||
|     }, | ||||
|     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, | ||||
|  ] | ||||
| tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") | ||||
| print(tokenizer.decode(tokenized_chat[0])) | ||||
| ``` | ||||
| This will yield a string in the input format that Zephyr expects.  | ||||
| ```text | ||||
| <|system|> | ||||
| You are a friendly chatbot who always responds in the style of a pirate</s>  | ||||
| <|user|> | ||||
| How many helicopters can a human eat in one sitting?</s>  | ||||
| <|assistant|> | ||||
| ``` | ||||
|  | ||||
| Now that our input is formatted correctly for Zephyr, we can use the model to generate a response to the user's question: | ||||
|  | ||||
| ```python | ||||
| outputs = model.generate(tokenized_chat, max_new_tokens=128)  | ||||
| print(tokenizer.decode(outputs[0])) | ||||
| ``` | ||||
|  | ||||
| This will yield: | ||||
|  | ||||
| ```text | ||||
| <|system|> | ||||
| You are a friendly chatbot who always responds in the style of a pirate</s>  | ||||
| <|user|> | ||||
| How many helicopters can a human eat in one sitting?</s>  | ||||
| <|assistant|> | ||||
| Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all. | ||||
| ``` | ||||
|  | ||||
| Arr, 'twas easy after all! | ||||
|  | ||||
|  | ||||
| ## Is there an automated pipeline for chat? | ||||
|  | ||||
| Yes, there is! Our text generation pipelines support chat inputs, which makes it easy to use chat models. In the past, | ||||
| we used to use a dedicated "ConversationalPipeline" class, but this has now been deprecated and its functionality | ||||
| has been merged into the [`TextGenerationPipeline`]. Let's try the `Zephyr` example again, but this time using  | ||||
| a pipeline: | ||||
|  | ||||
| ```python | ||||
| from transformers import pipeline | ||||
|  | ||||
| pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta") | ||||
| messages = [ | ||||
|     { | ||||
|         "role": "system", | ||||
|         "content": "You are a friendly chatbot who always responds in the style of a pirate", | ||||
|     }, | ||||
|     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, | ||||
| ] | ||||
| print(pipe(messages, max_new_tokens=128)[0]['generated_text'][-1])  # Print the assistant's response | ||||
| ``` | ||||
|  | ||||
| ```text | ||||
| {'role': 'assistant', 'content': "Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all."} | ||||
| ``` | ||||
|  | ||||
| The pipeline will take care of all the details of tokenization and calling `apply_chat_template` for you - | ||||
| once the model has a chat template, all you need to do is initialize the pipeline and pass it the list of messages! | ||||
|  | ||||
|  | ||||
| ## What are "generation prompts"? | ||||
|  | ||||
| You may have noticed that the `apply_chat_template` method has an `add_generation_prompt` argument. This argument tells | ||||
| the template to add tokens that indicate the start of a bot response. For example, consider the following chat: | ||||
|  | ||||
| ```python | ||||
| messages = [ | ||||
|     {"role": "user", "content": "Hi there!"}, | ||||
|     {"role": "assistant", "content": "Nice to meet you!"}, | ||||
|     {"role": "user", "content": "Can I ask a question?"} | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| Here's what this will look like without a generation prompt, for a model that uses standard "ChatML" formatting: | ||||
|  | ||||
| ```python | ||||
| tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) | ||||
| """<|im_start|>user | ||||
| Hi there!<|im_end|> | ||||
| <|im_start|>assistant | ||||
| Nice to meet you!<|im_end|> | ||||
| <|im_start|>user | ||||
| Can I ask a question?<|im_end|> | ||||
| """ | ||||
| ``` | ||||
|  | ||||
| And here's what it looks like **with** a generation prompt: | ||||
|  | ||||
| ```python | ||||
| tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) | ||||
| """<|im_start|>user | ||||
| Hi there!<|im_end|> | ||||
| <|im_start|>assistant | ||||
| Nice to meet you!<|im_end|> | ||||
| <|im_start|>user | ||||
| Can I ask a question?<|im_end|> | ||||
| <|im_start|>assistant | ||||
| """ | ||||
| ``` | ||||
|  | ||||
| Note that this time, we've added the tokens that indicate the start of a bot response. This ensures that when the model | ||||
| generates text it will write a bot response instead of doing something unexpected, like continuing the user's  | ||||
| message. Remember, chat models are still just language models - they're trained to continue text, and chat is just a  | ||||
| special kind of text to them! You need to guide them with appropriate control tokens, so they know what they're  | ||||
| supposed to be doing. | ||||
|  | ||||
| Not all models require generation prompts. Some models, like LLaMA, don't have any | ||||
| special tokens before bot responses. In these cases, the `add_generation_prompt` argument will have no effect. The exact | ||||
| effect that `add_generation_prompt` has will depend on the template being used. | ||||
|  | ||||
|  | ||||
| ## What does "continue_final_message" do? | ||||
|  | ||||
| When passing a list of messages to `apply_chat_template` or `TextGenerationPipeline`, you can choose | ||||
| to format the chat so the model will continue the final message in the chat instead of starting a new one. This is done | ||||
| by removing any end-of-sequence tokens that indicate the end of the final message, so that the model will simply | ||||
| extend the final message when it begins to generate text. This is useful for "prefilling" the model's response.  | ||||
|  | ||||
| Here's an example: | ||||
|  | ||||
| ```python | ||||
| chat = [ | ||||
|     {"role": "user", "content": "Can you format the answer in JSON?"}, | ||||
|     {"role": "assistant", "content": '{"name": "'}, | ||||
| ] | ||||
|  | ||||
| formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, return_tensors="pt", continue_final_message=True) | ||||
| model.generate(**formatted_chat) | ||||
| ``` | ||||
|  | ||||
| The model will generate text that continues the JSON string, rather than starting a new message. This approach | ||||
| can be very useful for improving the accuracy of the model's instruction-following when you know how you want | ||||
| it to start its replies. | ||||
|  | ||||
| Because `add_generation_prompt` adds the tokens that start a new message, and `continue_final_message` removes any | ||||
| end-of-message tokens from the final message, it does not make sense to use them together. As a result, you'll | ||||
| get an error if you try! | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| The default behaviour of `TextGenerationPipeline` is to set `add_generation_prompt=True` so that it starts a new | ||||
| message. However, if the final message in the input chat has the "assistant" role, it will assume that this message is  | ||||
| a prefill and switch to `continue_final_message=True` instead, because most models do not support multiple  | ||||
| consecutive assistant messages. You can override this behaviour by explicitly passing the `continue_final_message`  | ||||
| argument when calling the pipeline. | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
|  | ||||
| ## Can I use chat templates in training? | ||||
|  | ||||
| Yes! This is a good way to ensure that the chat template matches the tokens the model sees during training. | ||||
| We recommend that you apply the chat template as a preprocessing step for your dataset. After this, you | ||||
| can simply continue like any other language model training task. When training, you should usually set  | ||||
| `add_generation_prompt=False`, because the added tokens to prompt an assistant response will not be helpful during  | ||||
| training. Let's see an example: | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoTokenizer | ||||
| from datasets import Dataset | ||||
|  | ||||
| tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") | ||||
|  | ||||
| chat1 = [ | ||||
|     {"role": "user", "content": "Which is bigger, the moon or the sun?"}, | ||||
|     {"role": "assistant", "content": "The sun."} | ||||
| ] | ||||
| chat2 = [ | ||||
|     {"role": "user", "content": "Which is bigger, a virus or a bacterium?"}, | ||||
|     {"role": "assistant", "content": "A bacterium."} | ||||
| ] | ||||
|  | ||||
| dataset = Dataset.from_dict({"chat": [chat1, chat2]}) | ||||
| dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)}) | ||||
| print(dataset['formatted_chat'][0]) | ||||
| ``` | ||||
| And we get: | ||||
| ```text | ||||
| <|user|> | ||||
| Which is bigger, the moon or the sun?</s> | ||||
| <|assistant|> | ||||
| The sun.</s> | ||||
| ``` | ||||
|  | ||||
| From here, just continue training like you would with a standard language modelling task, using the `formatted_chat` column. | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| By default, some tokenizers add special tokens like `<bos>` and `<eos>` to text they tokenize. Chat templates should  | ||||
| already include all the special tokens they need, and so additional special tokens will often be incorrect or  | ||||
| duplicated, which will hurt model performance. | ||||
|  | ||||
| Therefore, if you format text with `apply_chat_template(tokenize=False)`, you should set the argument | ||||
| `add_special_tokens=False` when you tokenize that text later. If you use `apply_chat_template(tokenize=True)`, you don't need to worry about this! | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
							
								
								
									
										289
									
								
								docs/source/en/chat_template_multimodal.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										289
									
								
								docs/source/en/chat_template_multimodal.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,289 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Multimodal Chat Templates for Vision and Audio LLMs | ||||
|  | ||||
| In this section, we'll explore how to use chat templates with multimodal models, enabling your templates to handle a variety of inputs such as text, images, and audio. Multimodal models provide richer, more interactive experiences, and understanding how to effectively combine these inputs within your templates is key. We’ll walk through how to work with different modalities, configure your templates for optimal performance, and tackle common challenges along the way. | ||||
|  | ||||
| Just like with text-only LLMs, multimodal models expect a chat with **messages**, each of which includes a **role** and **content**. However, for multimodal models, chat templates are a part of the [Processor](./main_classes/processors) class. Let's see how we can format our prompts when there are images or videos in the input along with text. | ||||
|  | ||||
|  | ||||
| ## Image inputs | ||||
|  | ||||
| For models such as [LLaVA](https://huggingface.co/llava-hf) the prompts can be formatted as below. Notice that the only difference from text-only models is that we need to also pass a placeholder for input images. To accommodate extra modalities, each **content** is a list of entries, each of which has a **type** of either text or image. | ||||
|  | ||||
| Let's make this concrete with a quick example using the `llava-hf/llava-onevision-qwen2-0.5b-ov-hf` model: | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration | ||||
|  | ||||
| model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" | ||||
| processor = AutoProcessor.from_pretrained(model_id) | ||||
|  | ||||
| messages = [ | ||||
|     { | ||||
|         "role": "system", | ||||
|         "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], | ||||
|     }, | ||||
|     { | ||||
|       "role": "user", | ||||
|       "content": [ | ||||
|             {"type": "image"}, | ||||
|             {"type": "text", "text": "What are these?"}, | ||||
|         ], | ||||
|     }, | ||||
| ] | ||||
|  | ||||
| formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) | ||||
| print(formatted_prompt) | ||||
| ``` | ||||
|  | ||||
| This yields a string in LLaVA's expected input format with many `<image>` tokens prepended before the text. | ||||
| ```text | ||||
| <|im_start|>system  | ||||
| You are a friendly chatbot who always responds in the style of a pirate<|im_end|><|im_start|>user <image> | ||||
| What are these?<|im_end|> | ||||
| ``` | ||||
|  | ||||
|  | ||||
| ### Image paths or URLs | ||||
|  | ||||
| To incorporate images into your chat templates, you can pass them as file paths or URLs. This method automatically loads the image, processes it, and prepares the necessary pixel values to create ready-to-use inputs for the model. This approach simplifies the integration of images, enabling seamless multimodal functionality. | ||||
|  | ||||
| Let's see how it works with an example using the same model as above. This time we'll indicate an image URL with `"url"` key in the message's **content** and ask the chat template to `tokenize` and `return_dict`. Currently, "base64", "url", and "path" are supported image sources. | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration | ||||
|  | ||||
| model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" | ||||
| model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id) | ||||
| processor = AutoProcessor.from_pretrained(model_id) | ||||
|  | ||||
| messages = [ | ||||
|     { | ||||
|         "role": "system", | ||||
|         "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], | ||||
|     }, | ||||
|     { | ||||
|       "role": "user", | ||||
|       "content": [ | ||||
|             {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, | ||||
|             {"type": "text", "text": "What are these?"}, | ||||
|         ], | ||||
|     }, | ||||
| ] | ||||
|  | ||||
| processed_chat = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt") | ||||
| print(processed_chat.keys()) | ||||
| ``` | ||||
|  | ||||
| This yields a dictionary with inputs processed and ready to be further passed into [`~GenerationMixin.generate`] to generate text. | ||||
| ```text | ||||
| dict_keys(["input_ids", "attention_mask", "pixel_values", "image_sizes"]) | ||||
| ``` | ||||
|  | ||||
|  | ||||
| ## Video inputs | ||||
|  | ||||
| Some vision models support videos as inputs as well as images. The message format is very similar to the image-only models with tiny differences to handle loading videos from a URL. We can continue using the same model as before since it supports videos. | ||||
|  | ||||
| ### Sampling with fixed number of frames | ||||
|  | ||||
| Here's an example of how to set up a conversation with video inputs. Notice the extra `kwargs` passed to `processor.apply_chat_template()`. The key parameter here is `num_frames`, which controls how many frames to sample uniformly from the video. Each model checkpoint has a maximum frame count it was trained with, and exceeding this limit can significantly impact generation quality. So, it’s important to choose a frame count that fits both the model's capacity and your computational resources. If you don't specify `num_frames`, the entire video will be loaded without any frame sampling. | ||||
|  | ||||
| You also have the option to choose a specific framework to load the video, depending on your preferences or needs. Currently, we support `decord`, `pyav` (the default), `opencv`, and `torchvision`. For this example, we’ll use `decord`, as it's a bit faster than `pyav`. | ||||
|  | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| Note that if you are trying to load a video from URL, you can decode the video only with `pyav` or `decord` as backend. | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration | ||||
|  | ||||
| model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" | ||||
| model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id) | ||||
| processor = AutoProcessor.from_pretrained(model_id) | ||||
|  | ||||
| messages = [ | ||||
|     { | ||||
|         "role": "system", | ||||
|         "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], | ||||
|     }, | ||||
|     { | ||||
|       "role": "user", | ||||
|       "content": [ | ||||
|             {"type": "video", "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"}, | ||||
|             {"type": "text", "text": "What do you see in this video?"}, | ||||
|         ], | ||||
|     }, | ||||
| ] | ||||
|  | ||||
| processed_chat = processor.apply_chat_template( | ||||
|     messages, | ||||
|     add_generation_prompt=True, | ||||
|     tokenize=True, | ||||
|     return_dict=True, | ||||
|     return_tensors="pt", | ||||
|     num_frames=32, | ||||
|     video_load_backend="decord", | ||||
| ) | ||||
| print(processed_chat.keys()) | ||||
| ``` | ||||
|  | ||||
| ### Sampling with FPS | ||||
|  | ||||
| When working with long videos, you might want to sample more frames for better representation. Instead of a fixed number of frames, you can specify `video_fps`, which determines how many frames per second to extract. For example, if a video is **10 seconds long** and you set `video_fps=2`, the model will sample **20 frames** (2 per second, uniformly spaced).  | ||||
|  | ||||
| Using the above model, we need to apply the chat template as follows to sample 2 frames per second. | ||||
|  | ||||
| ```python | ||||
| processed_chat = processor.apply_chat_template( | ||||
|     messages, | ||||
|     add_generation_prompt=True, | ||||
|     tokenize=True, | ||||
|     return_dict=True, | ||||
|     video_fps=2, | ||||
|     video_load_backend="decord", | ||||
| ) | ||||
| print(processed_chat.keys()) | ||||
| ``` | ||||
|  | ||||
|  | ||||
| ### Custom Frame Sampling with a Function   | ||||
|  | ||||
| Not all models sample frames **uniformly** — some require more complex logic to determine which frames to use. If your model follows a different sampling strategy, you can **customize** frame selection by providing a function:   | ||||
|  | ||||
| 🔹 Use the `sample_indices_fn` argument to pass a **callable function** for sampling.   | ||||
| 🔹 If provided, this function **overrides** the standard `num_frames` and `video_fps` sampling options.   | ||||
| 🔹 It receives all the arguments passed to `load_video` and must return **valid frame indices** to sample.   | ||||
|  | ||||
| You should use `sample_indices_fn` when: | ||||
|  | ||||
| - You need a custom sampling strategy (e.g., **adaptive frame selection** instead of uniform sampling).   | ||||
| - Your model prioritizes **key moments** in a video rather than evenly spaced frames.   | ||||
|  | ||||
| Here’s an example of how to implement it:   | ||||
|  | ||||
|  | ||||
| ```python | ||||
|  | ||||
| def sample_indices_fn(metadata, **kwargs): | ||||
|     # samples only the first and the second frame | ||||
|     return [0, 1] | ||||
|  | ||||
| processed_chat = processor.apply_chat_template( | ||||
|     messages, | ||||
|     add_generation_prompt=True, | ||||
|     tokenize=True, | ||||
|     return_dict=True, | ||||
|     sample_indices_fn=sample_indices_fn, | ||||
|     video_load_backend="decord", | ||||
| ) | ||||
| print(processed_chat.keys()) | ||||
| ``` | ||||
|  | ||||
| By using `sample_indices_fn`, you gain **full control** over frame selection, making your model **more adaptable** to different video scenarios. 🚀   | ||||
|  | ||||
|  | ||||
| ### List of image frames as video | ||||
|  | ||||
| Sometimes, instead of having a full video file, you might only have a set of sampled frames stored as images. | ||||
|  | ||||
| You can pass a list of image file paths, and the processor will automatically concatenate them into a video. Just make sure that all images have the same size, as they are assumed to be from the same video. | ||||
|  | ||||
|  | ||||
| ```python | ||||
| frames_paths = ["/path/to/frame0.png", "/path/to/frame5.png", "/path/to/frame10.png"] | ||||
| messages = [ | ||||
|     { | ||||
|         "role": "system", | ||||
|         "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], | ||||
|     }, | ||||
|     { | ||||
|       "role": "user", | ||||
|       "content": [ | ||||
|             {"type": "video", "path": frames_paths}, | ||||
|             {"type": "text", "text": "What do you see in this video?"}, | ||||
|         ], | ||||
|     }, | ||||
| ] | ||||
|  | ||||
| processed_chat = processor.apply_chat_template( | ||||
|     messages, | ||||
|     add_generation_prompt=True, | ||||
|     tokenize=True, | ||||
|     return_dict=True, | ||||
| ) | ||||
| print(processed_chat.keys()) | ||||
| ``` | ||||
|  | ||||
|  | ||||
| ## Multimodal conversational pipeline | ||||
|  | ||||
| [`ImageTextToTextPipeline`] currently accepts images as inputs but we are planning to add support for video inputs in the future. The pipeline supports chat inputs in the same format as we have seen above. Apart from that, the pipeline will accept chats in OpenAI format. This format is supported exclusively within the pipeline to make inference easier and more accessible.  | ||||
|  | ||||
| Here is how the OpenAI conversation format looks: | ||||
|  | ||||
| ```python | ||||
| messages = [ | ||||
|     { | ||||
|         "role": "user", | ||||
|         "content": [ | ||||
|             { | ||||
|                 "type": "text", | ||||
|                 "text": "What is in this image?", | ||||
|             }, | ||||
|             { | ||||
|                 "type": "image_url", | ||||
|                 "image_url": {"url": f"http://images.cocodataset.org/val2017/000000039769.jpg"}, | ||||
|             }, | ||||
|         ], | ||||
|     } | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| ## Best Practices for Multimodal Template Configuration | ||||
|  | ||||
|  | ||||
| To add a custom chat template for your multimodal LLM, simply create your template using [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with `processor.chat_template`. If you're new to writing chat templates or need some tips, check out our [tutorial here](./chat_template_advanced) for helpful guidance. | ||||
|  | ||||
| In some cases, you may want your template to handle a **list of content** from multiple modalities, while still supporting a plain string for text-only inference. Here's an example of how you can achieve that, using the [Llama-Vision](https://huggingface.co/collections/meta-llama/metas-llama-32-multimodal-models-675bfd70e574a62dd0e4059b) chat template. | ||||
|  | ||||
|  | ||||
| ``` | ||||
| {% for message in messages %} | ||||
| {% if loop.index0 == 0 %}{{ bos_token }}{% endif %} | ||||
| {{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} | ||||
| {% if message['content'] is string %} | ||||
| {{ message['content'] }} | ||||
| {% else %} | ||||
| {% for content in message['content'] %} | ||||
| {% if content['type'] == 'image' %} | ||||
| {{ '<|image|>' }} | ||||
| {% elif content['type'] == 'text' %} | ||||
| {{ content['text'] }} | ||||
| {% endif %} | ||||
| {% endfor %} | ||||
| {% endif %} | ||||
| {{ '<|eot_id|>' }} | ||||
| {% endfor %} | ||||
| {% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %} | ||||
| ``` | ||||
							
								
								
									
										410
									
								
								docs/source/en/chat_template_tools_and_documents.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										410
									
								
								docs/source/en/chat_template_tools_and_documents.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,410 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
|  | ||||
| # Expanding Chat Templates with Tools and Documents | ||||
|  | ||||
| The only argument that `apply_chat_template` requires is `messages`. However, you can pass any keyword | ||||
| argument to `apply_chat_template` and it will be accessible inside the template. This gives you a lot of freedom to use | ||||
| chat templates for many things. There are no restrictions on the names or the format of these arguments - you can pass | ||||
| strings, lists, dicts or whatever else you want.  | ||||
|  | ||||
| That said, there are some common use-cases for these extra arguments, | ||||
| such as passing tools for function calling, or documents for retrieval-augmented generation. In these common cases, | ||||
| we have some opinionated recommendations about what the names and formats of these arguments should be, which are | ||||
| described in the sections below. We encourage model authors to make their chat templates compatible with this format, | ||||
| to make it easy to transfer tool-calling code between models. | ||||
|  | ||||
| ## Tool use / function calling | ||||
|  | ||||
| "Tool use" LLMs can choose to call functions as external tools before generating an answer. When passing tools | ||||
| to a tool-use model, you can simply pass a list of functions to the `tools` argument: | ||||
|  | ||||
| ```python | ||||
| from datetime import datetime | ||||
|  | ||||
| def current_time(): | ||||
|     """Get the current local time as a string.""" | ||||
|     return str(datetime.now()) | ||||
|  | ||||
| def multiply(a: float, b: float): | ||||
|     """ | ||||
|     A function that multiplies two numbers | ||||
|      | ||||
|     Args: | ||||
|         a: The first number to multiply | ||||
|         b: The second number to multiply | ||||
|     """ | ||||
|     return a * b | ||||
|  | ||||
| tools = [current_time, multiply] | ||||
|  | ||||
| model_input = tokenizer.apply_chat_template( | ||||
|     messages, | ||||
|     tools=tools | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| In order for this to work correctly, you should write your functions in the format above, so that they can be parsed | ||||
| correctly as tools. Specifically, you should follow these rules: | ||||
|  | ||||
| - The function should have a descriptive name | ||||
| - Every argument must have a type hint | ||||
| - The function must have a docstring in the standard Google style (in other words, an initial function description   | ||||
|   followed by an `Args:` block that describes the arguments, unless the function does not have any arguments.)  | ||||
| - Do not include types in the `Args:` block. In other words, write `a: The first number to multiply`, not | ||||
|   `a (int): The first number to multiply`. Type hints should go in the function header instead. | ||||
| - The function can have a return type and a `Returns:` block in the docstring. However, these are optional | ||||
|   because most tool-use models ignore them. | ||||
|  | ||||
| ### Passing tool results to the model | ||||
|  | ||||
| The sample code above is enough to list the available tools for your model, but what happens if it wants to actually use | ||||
| one? If that happens, you should: | ||||
|  | ||||
| 1. Parse the model's output to get the tool name(s) and arguments. | ||||
| 2. Add the model's tool call(s) to the conversation. | ||||
| 3. Call the corresponding function(s) with those arguments. | ||||
| 4. Add the result(s) to the conversation. | ||||
|  | ||||
| ### A complete tool use example | ||||
|  | ||||
| Let's walk through a tool use example, step by step. For this example, we will use an 8B `Hermes-2-Pro` model, | ||||
| as it is one of the highest-performing tool-use models in its size category at the time of writing. If you have the | ||||
| memory, you can consider using a larger model instead like [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01) | ||||
| or [Mixtral-8x22B](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1), both of which also support tool use | ||||
| and offer even stronger performance. | ||||
|  | ||||
| First, let's load our model and tokenizer: | ||||
|  | ||||
| ```python | ||||
| import torch | ||||
| from transformers import AutoModelForCausalLM, AutoTokenizer | ||||
|  | ||||
| checkpoint = "NousResearch/Hermes-2-Pro-Llama-3-8B" | ||||
|  | ||||
| tokenizer = AutoTokenizer.from_pretrained(checkpoint) | ||||
| model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto") | ||||
| ``` | ||||
|  | ||||
| Next, let's define a list of tools: | ||||
|  | ||||
| ```python | ||||
| def get_current_temperature(location: str, unit: str) -> float: | ||||
|     """ | ||||
|     Get the current temperature at a location. | ||||
|      | ||||
|     Args: | ||||
|         location: The location to get the temperature for, in the format "City, Country" | ||||
|         unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"]) | ||||
|     Returns: | ||||
|         The current temperature at the specified location in the specified units, as a float. | ||||
|     """ | ||||
|     return 22.  # A real function should probably actually get the temperature! | ||||
|  | ||||
| def get_current_wind_speed(location: str) -> float: | ||||
|     """ | ||||
|     Get the current wind speed in km/h at a given location. | ||||
|      | ||||
|     Args: | ||||
|         location: The location to get the wind speed for, in the format "City, Country" | ||||
|     Returns: | ||||
|         The current wind speed at the given location in km/h, as a float. | ||||
|     """ | ||||
|     return 6.  # A real function should probably actually get the wind speed! | ||||
|  | ||||
| tools = [get_current_temperature, get_current_wind_speed] | ||||
| ``` | ||||
|  | ||||
| Now, let's set up a conversation for our bot: | ||||
|  | ||||
| ```python | ||||
| messages = [ | ||||
|   {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."}, | ||||
|   {"role": "user", "content": "Hey, what's the temperature in Paris right now?"} | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| Now, let's apply the chat template and generate a response: | ||||
|  | ||||
| ```python | ||||
| inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") | ||||
| inputs = {k: v.to(model.device) for k, v in inputs.items()} | ||||
| out = model.generate(**inputs, max_new_tokens=128) | ||||
| print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) | ||||
| ``` | ||||
|  | ||||
| And we get: | ||||
|  | ||||
| ```text | ||||
| <tool_call> | ||||
| {"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"} | ||||
| </tool_call><|im_end|> | ||||
| ``` | ||||
|  | ||||
| The model has called the function with valid arguments, in the format requested by the function docstring. It has | ||||
| inferred that we're most likely referring to the Paris in France, and it remembered that, as the home of SI units, | ||||
| the temperature in France should certainly be displayed in Celsius. | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| The output format above is specific to the `Hermes-2-Pro` model we're using in this example. Other models may emit different | ||||
| tool call formats, and you may need to do some manual parsing at this step. For example, `Llama-3.1` models will emit | ||||
| slightly different JSON, with `parameters` instead of `arguments`. Regardless of the format the model outputs, you  | ||||
| should add the tool call to the conversation in the format below, with `tool_calls`, `function` and `arguments` keys.  | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
| Next, let's append the model's tool call to the conversation. | ||||
|  | ||||
| ```python | ||||
| tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} | ||||
| messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]}) | ||||
| ``` | ||||
|  | ||||
| <Tip warning={true}> | ||||
|  | ||||
| If you're familiar with the OpenAI API, you should pay attention to an important difference here - the `tool_call` is | ||||
| a dict, but in the OpenAI API it's a JSON string. Passing a string may cause errors or strange model behaviour! | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
| Now that we've added the tool call to the conversation, we can call the function and append the result to the | ||||
| conversation. Since we're just using a dummy function for this example that always returns 22.0, we can just append  | ||||
| that result directly. | ||||
|  | ||||
| ```python | ||||
| messages.append({"role": "tool", "name": "get_current_temperature", "content": "22.0"}) | ||||
| ``` | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| Some model architectures, notably Mistral/Mixtral, also require a `tool_call_id` here, which should be | ||||
| 9 randomly-generated alphanumeric characters, and assigned to the `id` key of the tool call | ||||
| dictionary. The same ID should also be assigned to the `tool_call_id` key of the tool response dictionary below, so  | ||||
| that tool calls can be matched to tool responses. So, for Mistral/Mixtral models, the code above would be: | ||||
|  | ||||
| ```python | ||||
| tool_call_id = "9Ae3bDc2F"  # Random ID, 9 alphanumeric characters | ||||
| tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} | ||||
| messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]}) | ||||
| ``` | ||||
|  | ||||
| and | ||||
|  | ||||
| ```python | ||||
| messages.append({"role": "tool", "tool_call_id": tool_call_id, "name": "get_current_temperature", "content": "22.0"}) | ||||
| ``` | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
| Finally, let's let the assistant read the function outputs and continue chatting with the user: | ||||
|  | ||||
| ```python | ||||
| inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") | ||||
| inputs = {k: v.to(model.device) for k, v in inputs.items()} | ||||
| out = model.generate(**inputs, max_new_tokens=128) | ||||
| print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) | ||||
| ``` | ||||
|  | ||||
| And we get: | ||||
|  | ||||
| ```text | ||||
| The current temperature in Paris, France is 22.0 ° Celsius.<|im_end|> | ||||
| ``` | ||||
|  | ||||
| Although this was a simple demo with dummy tools and a single call, the same technique works with  | ||||
| multiple real tools and longer conversations. This can be a powerful way to extend the capabilities of conversational | ||||
| agents with real-time information, computational tools like calculators, or access to large databases. | ||||
|  | ||||
| ### Understanding tool schemas | ||||
|  | ||||
| Each function you pass to the `tools` argument of `apply_chat_template` is converted into a  | ||||
| [JSON schema](https://json-schema.org/learn/getting-started-step-by-step). These schemas | ||||
| are then passed to the model chat template. In other words, tool-use models do not see your functions directly, and they | ||||
| never see the actual code inside them. What they care about is the function **definitions** and the **arguments** they | ||||
| need to pass to them - they care about what the tools do and how to use them, not how they work! It is up to you | ||||
| to read their outputs, detect if they have requested to use a tool, pass their arguments to the tool function, and | ||||
| return the response in the chat. | ||||
|  | ||||
| Generating JSON schemas to pass to the template should be automatic and invisible as long as your functions | ||||
| follow the specification above, but if you encounter problems, or you simply want more control over the conversion,  | ||||
| you can handle the conversion manually. Here is an example of a manual schema conversion. | ||||
|  | ||||
| ```python | ||||
| from transformers.utils import get_json_schema | ||||
|  | ||||
| def multiply(a: float, b: float): | ||||
|     """ | ||||
|     A function that multiplies two numbers | ||||
|      | ||||
|     Args: | ||||
|         a: The first number to multiply | ||||
|         b: The second number to multiply | ||||
|     """ | ||||
|     return a * b | ||||
|  | ||||
| schema = get_json_schema(multiply) | ||||
| print(schema) | ||||
| ``` | ||||
|  | ||||
| This will yield: | ||||
|  | ||||
| ```json | ||||
| { | ||||
|   "type": "function",  | ||||
|   "function": { | ||||
|     "name": "multiply",  | ||||
|     "description": "A function that multiplies two numbers",  | ||||
|     "parameters": { | ||||
|       "type": "object",  | ||||
|       "properties": { | ||||
|         "a": { | ||||
|           "type": "number",  | ||||
|           "description": "The first number to multiply" | ||||
|         },  | ||||
|         "b": { | ||||
|           "type": "number", | ||||
|           "description": "The second number to multiply" | ||||
|         } | ||||
|       },  | ||||
|       "required": ["a", "b"] | ||||
|     } | ||||
|   } | ||||
| } | ||||
| ``` | ||||
|  | ||||
| If you wish, you can edit these schemas, or even write them from scratch yourself without using `get_json_schema` at  | ||||
| all. JSON schemas can be passed directly to the `tools` argument of  | ||||
| `apply_chat_template` - this gives you a lot of power to define precise schemas for more complex functions. Be careful, | ||||
| though - the more complex your schemas, the more likely the model is to get confused when dealing with them! We  | ||||
| recommend simple function signatures where possible, keeping arguments (and especially complex, nested arguments)  | ||||
| to a minimum. | ||||
|  | ||||
| Here is an example of defining schemas by hand, and passing them directly to `apply_chat_template`: | ||||
|  | ||||
| ```python | ||||
| # A simple function that takes no arguments | ||||
| current_time = { | ||||
|   "type": "function",  | ||||
|   "function": { | ||||
|     "name": "current_time", | ||||
|     "description": "Get the current local time as a string.", | ||||
|     "parameters": { | ||||
|       'type': 'object', | ||||
|       'properties': {} | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| # A more complete function that takes two numerical arguments | ||||
| multiply = { | ||||
|   'type': 'function', | ||||
|   'function': { | ||||
|     'name': 'multiply', | ||||
|     'description': 'A function that multiplies two numbers',  | ||||
|     'parameters': { | ||||
|       'type': 'object',  | ||||
|       'properties': { | ||||
|         'a': { | ||||
|           'type': 'number', | ||||
|           'description': 'The first number to multiply' | ||||
|         },  | ||||
|         'b': { | ||||
|           'type': 'number', 'description': 'The second number to multiply' | ||||
|         } | ||||
|       },  | ||||
|       'required': ['a', 'b'] | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| model_input = tokenizer.apply_chat_template( | ||||
|     messages, | ||||
|     tools = [current_time, multiply] | ||||
| ) | ||||
| ``` | ||||
|  | ||||
| ## Retrieval-augmented generation | ||||
|  | ||||
| "Retrieval-augmented generation" or "RAG" LLMs can search a corpus of documents for information before responding | ||||
| to a query. This allows models to vastly expand their knowledge base beyond their limited context size. Our  | ||||
| recommendation for RAG models is that their template | ||||
| should accept a `documents` argument. This should be a list of documents, where each "document" | ||||
| is a single dict with `title` and `text` keys, both of which are strings. Because this format is much simpler | ||||
| than the JSON schemas used for tools, no helper functions are necessary. | ||||
|  | ||||
| Here's an example of a RAG template in action: | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoTokenizer, AutoModelForCausalLM | ||||
|  | ||||
| # Load the model and tokenizer | ||||
| model_id = "CohereForAI/c4ai-command-r-v01-4bit" | ||||
| tokenizer = AutoTokenizer.from_pretrained(model_id) | ||||
| model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") | ||||
| device = model.device # Get the device the model is loaded on | ||||
|  | ||||
| # Define conversation input | ||||
| conversation = [ | ||||
|     {"role": "user", "content": "What has Man always dreamed of?"} | ||||
| ] | ||||
|  | ||||
| # Define documents for retrieval-based generation | ||||
| documents = [ | ||||
|     { | ||||
|         "title": "The Moon: Our Age-Old Foe",  | ||||
|         "text": "Man has always dreamed of destroying the moon. In this essay, I shall..." | ||||
|     }, | ||||
|     { | ||||
|         "title": "The Sun: Our Age-Old Friend", | ||||
|         "text": "Although often underappreciated, the sun provides several notable benefits..." | ||||
|     } | ||||
| ] | ||||
|  | ||||
| # Tokenize conversation and documents using a RAG template, returning PyTorch tensors. | ||||
| input_ids = tokenizer.apply_chat_template( | ||||
|     conversation=conversation, | ||||
|     documents=documents, | ||||
|     chat_template="rag", | ||||
|     tokenize=True, | ||||
|     add_generation_prompt=True, | ||||
|     return_tensors="pt").to(device) | ||||
|  | ||||
| # Generate a response  | ||||
| gen_tokens = model.generate( | ||||
|     input_ids, | ||||
|     max_new_tokens=100, | ||||
|     do_sample=True, | ||||
|     temperature=0.3, | ||||
|     ) | ||||
|  | ||||
| # Decode and print the generated text along with generation prompt | ||||
| gen_text = tokenizer.decode(gen_tokens[0]) | ||||
| print(gen_text) | ||||
| ``` | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| The `documents` input for retrieval-augmented generation is not widely supported, and many models have chat templates which simply ignore this input. | ||||
|  | ||||
| To verify if a model supports the `documents` input, you can read its model card, or `print(tokenizer.chat_template)` to see if the `documents` key is used anywhere. | ||||
|  | ||||
| One model class that does support it, though, is Cohere's [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) and [Command-R+](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024), through their `rag` chat template. You can see additional examples of grounded generation using this feature in their model cards. | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
|  | ||||
| @ -1,229 +0,0 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Templates | ||||
|  | ||||
| The [chat pipeline](./conversations) guide introduced [`TextGenerationPipeline`] and the concept of a chat prompt or chat template for conversing with a model. Underlying this high-level pipeline is the [`apply_chat_template`] method. A chat template is a part of the tokenizer and it specifies how to convert conversations into a single tokenizable string in the expected model format. | ||||
|  | ||||
| In the example below, Mistral-7B-Instruct and Zephyr-7B are finetuned from the same base model but they’re trained with different chat formats. Without chat templates, you have to manually write formatting code for each model and even minor errors can hurt performance. Chat templates offer a universal way to format chat inputs to any model. | ||||
|  | ||||
| <hfoptions id="template"> | ||||
| <hfoption id="Mistral"> | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoTokenizer | ||||
|  | ||||
| tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1") | ||||
| chat = [ | ||||
|   {"role": "user", "content": "Hello, how are you?"}, | ||||
|   {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, | ||||
|   {"role": "user", "content": "I'd like to show off how chat templating works!"}, | ||||
| ] | ||||
|  | ||||
| tokenizer.apply_chat_template(chat, tokenize=False) | ||||
| ``` | ||||
| ```md | ||||
| <s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST] | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| <hfoption id="Zephyr"> | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoTokenizer | ||||
|  | ||||
| tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") | ||||
| chat = [ | ||||
|   {"role": "user", "content": "Hello, how are you?"}, | ||||
|   {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, | ||||
|   {"role": "user", "content": "I'd like to show off how chat templating works!"}, | ||||
| ] | ||||
|  | ||||
| tokenizer.apply_chat_template(chat, tokenize=False) | ||||
| ``` | ||||
| ```md | ||||
| <|user|>\nHello, how are you?</s>\n<|assistant|>\nI'm doing great. How can I help you today?</s>\n<|user|>\nI'd like to show off how chat templating works!</s>\n | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| </hfoptions> | ||||
|  | ||||
| This guide explores [`apply_chat_template`] and chat templates in more detail. | ||||
|  | ||||
| ## apply_chat_template | ||||
|  | ||||
| Chats should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker (usually between you and the system), and the `content` key contains your message. For the system, the `content` is a high-level description of how the model should behave and respond when you’re chatting with it. | ||||
|  | ||||
| Pass your messages to [`apply_chat_template`] to tokenize and format them. You can set [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` to indicate the start of a message. | ||||
|  | ||||
| ```py | ||||
| import torch | ||||
| from transformers import AutoModelForCausalLM, AutoTokenizer | ||||
|  | ||||
| tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") | ||||
| model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", device_map="auto", torch_dtype=torch.bfloat16) | ||||
|  | ||||
| messages = [ | ||||
|     {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate",}, | ||||
|     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, | ||||
|  ] | ||||
| tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") | ||||
| print(tokenizer.decode(tokenized_chat[0])) | ||||
| ``` | ||||
| ```md | ||||
| <|system|> | ||||
| You are a friendly chatbot who always responds in the style of a pirate</s> | ||||
| <|user|> | ||||
| How many helicopters can a human eat in one sitting?</s> | ||||
| <|assistant|> | ||||
| ``` | ||||
|  | ||||
| Now pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response. | ||||
|  | ||||
| ```py | ||||
| outputs = model.generate(tokenized_chat, max_new_tokens=128)  | ||||
| print(tokenizer.decode(outputs[0])) | ||||
| ``` | ||||
| ```md | ||||
| <|system|> | ||||
| You are a friendly chatbot who always responds in the style of a pirate</s> | ||||
| <|user|> | ||||
| How many helicopters can a human eat in one sitting?</s> | ||||
| <|assistant|> | ||||
| Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all. | ||||
| ``` | ||||
|  | ||||
| ### add_generation_prompt | ||||
| The [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) parameter adds tokens that indicate the start of a response. This ensures the chat model generates a system response instead of continuing a user’s message. | ||||
|  | ||||
| Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don’t have any special tokens before the system response. In this case, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect. | ||||
|  | ||||
| ```py | ||||
| tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) | ||||
| tokenized_chat | ||||
| ``` | ||||
| ```md | ||||
| <|im_start|>user | ||||
| Hi there!<|im_end|> | ||||
| <|im_start|>assistant | ||||
| Nice to meet you!<|im_end|> | ||||
| <|im_start|>user | ||||
| Can I ask a question?<|im_end|> | ||||
| ``` | ||||
|  | ||||
| ### continue_final_message | ||||
|  | ||||
| The [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) parameter controls whether the final message in the chat should be continued instead of starting a new one. It removes end of sequence tokens so that the model continues generation from the final message. | ||||
|  | ||||
| This is useful for “prefilling” a model response. In the example below, the model generates text that continues the JSON string rather than starting a new message. It can be very useful for improving the accuracy for instruction following when you know how to start its replies. | ||||
|  | ||||
| ```py | ||||
| chat = [ | ||||
|     {"role": "user", "content": "Can you format the answer in JSON?"}, | ||||
|     {"role": "assistant", "content": '{"name": "'}, | ||||
| ] | ||||
|  | ||||
| formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_final_message=True) | ||||
| model.generate(**formatted_chat) | ||||
| ``` | ||||
|  | ||||
| > [!WARNING] | ||||
| > You shouldn’t use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error. | ||||
|  | ||||
| [`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the “assistant” role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don’t support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) to the pipeline. | ||||
|  | ||||
| ## Multiple templates | ||||
|  | ||||
| A model may have several different templates for different use cases. For example, a model may have a template for regular chat, tool use, and RAG. | ||||
|  | ||||
| When there are multiple templates, the chat template is a dictionary. Each key corresponds to the name of a template. [`apply_chat_template`] handles multiple templates based on their name. It looks for a template named `default` in most cases and if it can’t find one, it raises an error. | ||||
|  | ||||
| For a tool calling template, if a user passes a `tools` parameter and a `tool_use` template exists, the tool calling template is used instead of `default`. | ||||
|  | ||||
| To access templates with other names, pass the template name to the `chat_template` parameter in [`apply_chat_template`]. For example, if you’re using a RAG template then set `chat_template="rag"`. | ||||
|  | ||||
| It can be confusing to manage multiple templates though, so we recommend using a single template for all use cases. Use Jinja statements like `if tools is defined` and `{% macro %}` definitions to wrap multiple code paths in a single template. | ||||
|  | ||||
| ## Template selection | ||||
|  | ||||
| It is important to set a chat template format that matches the template format a model was pretrained on, otherwise performance may suffer. Even if you’re training the model further, performance is best if the chat tokens are kept constant. | ||||
|  | ||||
| But if you’re training a model from scratch or finetuning a model for chat, you have more options to select a template. For example, [ChatML](https://github.com/openai/openai-python/blob/release-v0.28.0/chatml.md) is a popular format that is flexible enough to handle many use cases. It even includes support for [generation prompts](#add_generation_prompt), but it doesn’t add beginning-of-string (`BOS`) or end-of-string (`EOS`) tokens. If your model expects `BOS` and `EOS` tokens, set `add_special_tokens=True` and make sure to add them to your template. | ||||
|  | ||||
| ```py | ||||
| {%- for message in messages %} | ||||
|     {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} | ||||
| {%- endfor %} | ||||
| ``` | ||||
|  | ||||
| Set the template with the following logic to support [generation prompts](#add_generation_prompt). The template wraps each message with `<|im_start|>` and `<|im_end|>` tokens and writes the role as a string. This allows you to easily customize the roles you want to train with. | ||||
|  | ||||
| ```py | ||||
| tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" | ||||
| ``` | ||||
|  | ||||
| The `user`, `system` and `assistant` roles are standard roles in chat templates. We recommend using these roles when it makes sense, especially if you’re using your model with the [`TextGenerationPipeline`]. | ||||
|  | ||||
| ```py | ||||
| <|im_start|>system | ||||
| You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|> | ||||
| <|im_start|>user | ||||
| How are you?<|im_end|> | ||||
| <|im_start|>assistant | ||||
| I'm doing great!<|im_end|> | ||||
| ``` | ||||
|  | ||||
| ## Model training | ||||
|  | ||||
| Training a model with a chat template is a good way to ensure a chat template matches the tokens a model is trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren’t helpful during training. | ||||
|  | ||||
| An example of preprocessing a dataset with a chat template is shown below. | ||||
|  | ||||
| ```py | ||||
| from transformers import AutoTokenizer | ||||
| from datasets import Dataset | ||||
|  | ||||
| tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") | ||||
|  | ||||
| chat1 = [ | ||||
|     {"role": "user", "content": "Which is bigger, the moon or the sun?"}, | ||||
|     {"role": "assistant", "content": "The sun."} | ||||
| ] | ||||
| chat2 = [ | ||||
|     {"role": "user", "content": "Which is bigger, a virus or a bacterium?"}, | ||||
|     {"role": "assistant", "content": "A bacterium."} | ||||
| ] | ||||
|  | ||||
| dataset = Dataset.from_dict({"chat": [chat1, chat2]}) | ||||
| dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)}) | ||||
| print(dataset['formatted_chat'][0]) | ||||
| ``` | ||||
| ```md | ||||
| <|user|> | ||||
| Which is bigger, the moon or the sun?</s> | ||||
| <|assistant|> | ||||
| The sun.</s> | ||||
| ``` | ||||
|  | ||||
| After this step, you can continue following the [training recipe](./tasks/language_modeling) for causal language models using the `formatted_chat` column. | ||||
|  | ||||
| Some tokenizers add special `<bos>` and `<eos>` tokens. Chat templates should already include all the necessary special tokens, and adding additional special tokens is often incorrect or duplicated, hurting model performance. When you format text with `apply_chat_template(tokenize=False)`, make sure you set `add_special_tokens=False` as well to avoid duplicating them. | ||||
|  | ||||
| ```py | ||||
| apply_chat_template(messages, tokenize=False, add_special_tokens=False) | ||||
| ``` | ||||
|  | ||||
| This isn’t an issue if `apply_chat_template(tokenize=True)`. | ||||
| @ -1,243 +0,0 @@ | ||||
| <!--Copyright 2025 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Multimodal templates | ||||
|  | ||||
| Multimodal model chat templates expect a similar [template](./chat_templating) as text-only models. It needs `messages` that includes a dictionary of the `role` and `content`. | ||||
|  | ||||
| Multimodal templates are included in the [Processor](./processors) class and require an additional `type` key for specifying whether the included content is an image, video, or text. | ||||
|  | ||||
| This guide will show you how to format chat templates for multimodal models as well as some best practices for configuring the template. | ||||
|  | ||||
| ## ImageTextToTextPipeline | ||||
|  | ||||
| [`ImageTextToTextPipeline`] is a high-level image and text generation class with a “chat mode”. Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format). | ||||
|  | ||||
| Start by building a chat history with the following two roles. | ||||
|  | ||||
| - `system` describes how the model should behave and respond when you’re chatting with it. This role isn’t supported by all chat models. | ||||
| - `user` is where you enter your first message to the model. | ||||
|  | ||||
| ```py | ||||
| messages = [ | ||||
|     { | ||||
|         "role": "system", | ||||
|         "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], | ||||
|     }, | ||||
|     { | ||||
|       "role": "user", | ||||
|       "content": [ | ||||
|             {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, | ||||
|             {"type": "text", "text": "What are these?"}, | ||||
|         ], | ||||
|     }, | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| Create an [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map=“auto”](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory. | ||||
|  | ||||
| > [!TIP] | ||||
| > The [`ImageTextToTextPipeline`] accepts chats in the OpenAI format to make inference easier and more accessible.  | ||||
|  | ||||
| ```python | ||||
| import torch | ||||
| from transformers import pipeline | ||||
|  | ||||
| pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda", torch_dtype=torch.float16) | ||||
| pipeline(text=messages, max_new_tokens=50, return_full_text=False) | ||||
| [{'input_text': [{'role': 'system', | ||||
|     'content': [{'type': 'text', | ||||
|       'text': 'You are a friendly chatbot who always responds in the style of a pirate'}]}, | ||||
|    {'role': 'user', | ||||
|     'content': [{'type': 'image', | ||||
|       'url': 'http://images.cocodataset.org/val2017/000000039769.jpg'}, | ||||
|      {'type': 'text', 'text': 'What are these?'}]}], | ||||
|   'generated_text': 'The image shows two cats lying on a pink surface, which appears to be a cushion or a soft blanket. The cat on the left has a striped coat, typical of tabby cats, and is lying on its side with its head resting on the'}] | ||||
| ``` | ||||
|  | ||||
| ## Image inputs | ||||
|  | ||||
| For multimodal models that accept images like [LLaVA](./model_doc/llava), include the following in `content` as shown below. | ||||
|  | ||||
| - The content `"type"` can be an `"image"` or `"text"`. | ||||
| - For images, it can be a link to the image (`"url"`), a file path (`"path"`), or `"base64"`. Images are automatically loaded, processed, and prepared into pixel values as inputs to the model. | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration | ||||
|  | ||||
| model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf") | ||||
| processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf") | ||||
|  | ||||
| messages = [ | ||||
|     { | ||||
|       "role": "system", | ||||
|       "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], | ||||
|     }, | ||||
|     { | ||||
|       "role": "user", | ||||
|       "content": [ | ||||
|             {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, | ||||
|             {"type": "text", "text": "What are these?"}, | ||||
|         ], | ||||
|     }, | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content and return the `input_ids` and `pixel_values`. | ||||
|  | ||||
| ```py | ||||
| processed_chat = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt") | ||||
| print(processed_chat.keys()) | ||||
| ``` | ||||
|  | ||||
| These inputs are now ready to be used in [`~GenerationMixin.generate`]. | ||||
|  | ||||
| ## Video inputs | ||||
|  | ||||
| Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs). | ||||
|  | ||||
| - The content `"type"` should be `"video"` to indicate the content is a video. | ||||
| - For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord). | ||||
|  | ||||
| > [!WARNING] | ||||
| > Loading a video from `"url"` is only supported by the PyAV or Decord backends. | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration | ||||
|  | ||||
| model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" | ||||
| model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id) | ||||
| processor = AutoProcessor.from_pretrained(model_id) | ||||
|  | ||||
| messages = [ | ||||
|     { | ||||
|       "role": "system", | ||||
|       "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], | ||||
|     }, | ||||
|     { | ||||
|       "role": "user", | ||||
|       "content": [ | ||||
|             {"type": "video", "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"}, | ||||
|             {"type": "text", "text": "What do you see in this video?"}, | ||||
|         ], | ||||
|     }, | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that control the sampling process. | ||||
|  | ||||
| The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html). | ||||
|  | ||||
| The examples below use Decord as the backend because it is a bit faster than PyAV. | ||||
|  | ||||
| <hfoptions id="sampling"> | ||||
| <hfoption id="fixed number of frames"> | ||||
|  | ||||
| The `num_frames` parameter controls how many frames to uniformly sample from the video. Each checkpoint has a maximum frame count it was pretrained with and exceeding this count can significantly lower generation quality. It's important to choose a frame count that fits both the model capacity and your hardware resources. If `num_frames` isn't specified, the entire video is loaded without any frame sampling. | ||||
|  | ||||
|  | ||||
| ```python | ||||
| processed_chat = processor.apply_chat_template( | ||||
|     messages, | ||||
|     add_generation_prompt=True, | ||||
|     tokenize=True, | ||||
|     return_dict=True, | ||||
|     return_tensors="pt", | ||||
|     num_frames=32, | ||||
|     video_load_backend="decord", | ||||
| ) | ||||
| print(processed_chat.keys()) | ||||
| ``` | ||||
|  | ||||
| These inputs are now ready to be used in [`~GenerationMixin.generate`]. | ||||
|  | ||||
| </hfoption> | ||||
| <hfoption id="fps"> | ||||
|  | ||||
| For longer videos, it may be better to sample more frames for better representation with the `video_fps` parameter. This determines how many frames per second to extract. As an example, if a video is 10 seconds long and `video_fps=2`, then the model samples 20 frames. In other words, 2 frames are uniformly sampled every second. | ||||
|  | ||||
| ```py | ||||
| processed_chat = processor.apply_chat_template( | ||||
|     messages, | ||||
|     add_generation_prompt=True, | ||||
|     tokenize=True, | ||||
|     return_dict=True, | ||||
|     video_fps=32, | ||||
|     video_load_backend="decord", | ||||
| ) | ||||
| print(processed_chat.keys()) | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| <hfoption id="list of image frames"> | ||||
|  | ||||
| Videos may also exist as a set of sampled frames stored as images rather than the full video file. | ||||
|  | ||||
| In this case, pass a list of image file paths and the processor automatically concatenates them into a video. Make sure all images are the same size since they are assumed to be from the same video. | ||||
|  | ||||
| ```py | ||||
| frames_paths = ["/path/to/frame0.png", "/path/to/frame5.png", "/path/to/frame10.png"] | ||||
| messages = [ | ||||
|     { | ||||
|         "role": "system", | ||||
|         "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], | ||||
|     }, | ||||
|     { | ||||
|       "role": "user", | ||||
|       "content": [ | ||||
|             {"type": "video", "path": frames_paths}, | ||||
|             {"type": "text", "text": "What do you see in this video?"}, | ||||
|         ], | ||||
|     }, | ||||
| ] | ||||
|  | ||||
| processed_chat = processor.apply_chat_template( | ||||
|     messages, | ||||
|     add_generation_prompt=True, | ||||
|     tokenize=True, | ||||
|     return_dict=True, | ||||
| ) | ||||
| print(processed_chat.keys()) | ||||
| ``` | ||||
|  | ||||
| </hfoption> | ||||
| </hfoptions> | ||||
|  | ||||
| ## Template configuration | ||||
|  | ||||
| You can create a custom chat template with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with [`~ProcessorMixin.apply_chat_template`]. Refer to the [Template writing](./chat_templating_writing) guide for more details. | ||||
|  | ||||
| For example, to enable a template to handle a *list of content* from multiple modalities while still supporting plain strings for text-only inference, specify how to handle the `content['type']` if it is an image or text as shown below in the Llama 3.2 Vision Instruct [template](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct/blob/main/chat_template.json). | ||||
|  | ||||
| ```jinja | ||||
| {% for message in messages %} | ||||
| {% if loop.index0 == 0 %}{{ bos_token }}{% endif %} | ||||
| {{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} | ||||
| {% if message['content'] is string %} | ||||
| {{ message['content'] }} | ||||
| {% else %} | ||||
| {% for content in message['content'] %} | ||||
| {% if content['type'] == 'image' %} | ||||
| {{ '<|image|>' }} | ||||
| {% elif content['type'] == 'text' %} | ||||
| {{ content['text'] }} | ||||
| {% endif %} | ||||
| {% endfor %} | ||||
| {% endif %} | ||||
| {{ '<|eot_id|>' }} | ||||
| {% endfor %} | ||||
| {% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %} | ||||
| ``` | ||||
| @ -1,251 +0,0 @@ | ||||
| <!--Copyright 2024 The HuggingFace Team. All rights reserved. | ||||
|  | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | ||||
| the License. You may obtain a copy of the License at | ||||
|  | ||||
| http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  | ||||
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | ||||
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||||
| specific language governing permissions and limitations under the License. | ||||
|  | ||||
| ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | ||||
| rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Template writing | ||||
|  | ||||
| A chat template is a [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) template stored in the tokenizer's [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax. A chat template performs the following three roles. | ||||
|  | ||||
| 1. Print the role enclosed in `<|` and `|>` (`<|user|>`, `<|assistant|>`, etc.). | ||||
| 2. Print the message followed by an end-of-sequence (`EOS`) token. | ||||
| 3. Print the assistant token if [add_generation_prompt=True](./chat_templating#add_generation_prompt) so the model generates an assistant response. | ||||
|  | ||||
| An example template is shown below. | ||||
|  | ||||
| ```jinja | ||||
| {%- for message in messages %} | ||||
|     {{- '<|' + message['role'] + '|>\n' }} | ||||
|     {{- message['content'] + eos_token }} | ||||
| {%- endfor %} | ||||
| {%- if add_generation_prompt %} | ||||
|     {{- '<|assistant|>\n' }} | ||||
| {%- endif %} | ||||
| ``` | ||||
|  | ||||
| The template can be customized to handle more complex use cases. This guide will show you how to add and edit templates and includes template writing tips. | ||||
|  | ||||
| ## Create a template | ||||
|  | ||||
| Create a template by writing a Jinja template and then setting it as the chat template in the tokenizer. For example, the template below adds `[ASST]` and `[/ASST]` tags to the assistant messages. | ||||
|  | ||||
| ```jinja | ||||
| {%- for message in messages %} | ||||
|     {%- if message['role'] == 'user' %} | ||||
|         {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} | ||||
|     {%- elif message['role'] == 'system' %} | ||||
|         {{- '<<SYS>>\\n' + message['content'].strip() + '\\n<</SYS>>\\n\\n' }} | ||||
|     {%- elif message['role'] == 'assistant' %} | ||||
|         {{- '[ASST] '  + message['content'] + ' [/ASST]' + eos_token }} | ||||
|     {%- endif %} | ||||
| {%- endfor %} | ||||
| ``` | ||||
|  | ||||
| Set the template in the tokenizer, and the next time you use [`~PreTrainedTokenizerBase.apply_chat_template`], the new template is used. | ||||
|  | ||||
| ```py | ||||
| template = tokenizer.chat_template | ||||
| template = template.replace("SYS", "SYSTEM")  # Change the system token | ||||
| tokenizer.chat_template = template  # Set the new template | ||||
| ``` | ||||
|  | ||||
| The template is saved in the `tokenizer_config.json` file. Upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`] so you can reuse it later and make sure everyone is using the right template for your model. | ||||
|  | ||||
| ```py | ||||
| tokenizer.push_to_hub("model_name") | ||||
| ``` | ||||
|  | ||||
| ## Template writing tips | ||||
|  | ||||
| The easiest way to start writing Jinja templates is to refer to existing templates. Use `print(tokenizer.chat_template)` on any chat model to see what template it's using. Try starting with simple models that don't call any tools or support RAG. Finally, take a look at the [Jinja documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for more details about formatting and syntax. | ||||
|  | ||||
| This section curates some best practices for writing clean and efficient Jinja templates. | ||||
|  | ||||
| ### Trimming whitespace | ||||
|  | ||||
| Jinja prints any whitespace before or after a block of text. This can be an issue for chat templates because whitespace usage should be intentional. Add `-` to strip any whitespace before a block. | ||||
|  | ||||
| ```jinja | ||||
| {%- for message in messages %} | ||||
|     {{- message['role'] + message['content'] }} | ||||
| {%- endfor %} | ||||
| ``` | ||||
|  | ||||
| The incorrect whitespace usage example below may introduce a newline and indentation in the output. | ||||
|  | ||||
| ```jinja | ||||
| {% for message in messages %} | ||||
|     {{ message['role'] + message['content'] }} | ||||
| {% endfor %} | ||||
| ``` | ||||
|  | ||||
| ### Special variables | ||||
|  | ||||
| There are five special variables available inside a template. You can pass virtually any additional arguments to [`~PreTrainedTokenizerBase.apply_chat_template`] and it will be available inside the template as a variable. However, you should try to keep the number of variables to the five below to make it easier for users to use the chat model without writing custom code to handle model-specific arguments. | ||||
|  | ||||
| - `messages` contains the chat history as a list of message dicts. | ||||
| - `tools` contains a list of tools in JSON schema format. | ||||
| - `documents` contains a list of documents with the format `{"title": Title, "contents": "Contents"}` (designed for RAG models). | ||||
| - `add_generation_prompt` is a boolean that determines whether to add an assistant header at the end of the conversation. | ||||
| - `bos_token` and `eos_token` are special tokens extracted from a tokenizer's `special_tokens_map`. | ||||
|  | ||||
| ### Callable functions | ||||
|  | ||||
| There are two callable functions available inside a template. | ||||
|  | ||||
| - `raise_exception(msg)` raises a `TemplateException`. This is useful for debugging or warning users about incorrect template usage. | ||||
| - `strftime_now(format_str)` retrieves the current date and time in a specific format which could be useful to include in system messages. It is equivalent to [datetime.now().strftime(format_str)](https://docs.python.org/3/library/datetime.html#datetime.datetime.now) in Python. | ||||
|  | ||||
| ### Compatibility with non-Python Jinja | ||||
|  | ||||
| Jinja is implemented in multiple languages and they generally have the same syntax. Writing a template in Python allows you to use Python methods such as [lower](https://docs.python.org/3/library/stdtypes.html#str.lower) on strings or [items](https://docs.python.org/3/library/stdtypes.html#dict.items) on dicts. But this won't work if the template is used in a non-Python implementation, for example, when deploying with Javascript or Rust. | ||||
|  | ||||
| Make the changes below to ensure compatibility across all Jinja implementations. | ||||
|  | ||||
| - Replace Python methods with Jinja filters. For example, replace `string.lower()` with `string|lower` or `dict.items()` with `dict|dictitems`. Most of the changes follow the same pattern except `string.strip()`, which is replaced with `string|trim`. Refer to the list of [built-in filters](https://jinja.palletsprojects.com/en/3.1.x/templates/#builtin-filters) for a complete list of filters. | ||||
| - Replace `True`, `False`, and `None` (these are Python specific) with `true`, `false`, and `none` respectively. | ||||
| - Directly rendering a dict or list may return different results in other implementations. For example, string entries may change from single-quote to double-quote. To avoid this, add the [tojson](https://jinja.palletsprojects.com/en/3.1.x/templates/#jinja-filters.tojson) filter to maintain consistency. | ||||
|  | ||||
| ### Big templates | ||||
|  | ||||
| Newer models or models with features like [tool-calling](./chat_extras#tools) and [RAG](./chat_extras#retrieval-augmented-generation-rag) require larger templates that can be longer than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file correspond exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues. | ||||
|  | ||||
| Extract the chat template from the tokenizer and write it to a separate file. | ||||
|  | ||||
| ```py | ||||
| open("template.jinja", "w").write(tokenizer.chat_template) | ||||
| ``` | ||||
|  | ||||
| You could also load an edited template back into the tokenizer. | ||||
|  | ||||
| ```py | ||||
| tokenizer.chat_template = open("template.jinja").read() | ||||
| ``` | ||||
|  | ||||
| ## Templates for tools | ||||
|  | ||||
| There isn't a specific format for writing templates for tools but it is best to follow the standard API. This ensures the template is widely accessible across models without requiring users to write custom code to use tools with your model. | ||||
|  | ||||
| > [!WARNING] | ||||
| > Formatting such as whitespace and special tokens is model-specific. Make sure everything exactly matches the format a model was trained with. | ||||
|  | ||||
| The following section lists elements of the standard API for writing templates for tools. | ||||
|  | ||||
| ### Tool definitions | ||||
|  | ||||
| Transformers chat template methods allow a user to pass tools as Python functions or a JSON schema. When functions are passed, a JSON schema is automatically generated and passed to the template. The `tools` variable in a template always takes a list of JSON schemas. | ||||
|  | ||||
| The specific tokens and tool descriptions should match the ones your model was trained with. Your model doesn't need to understand the JSON schema input because your template can translate the JSON schema into your model's format. For example, [Command-R](./model_doc/cohere) was trained with tools defined with Python function headers, but the Command-R tool template accepts JSON schemas. The template internally converts types and renders the input tools as Python headers. | ||||
|  | ||||
| ```json | ||||
| { | ||||
|   "type": "function",  | ||||
|   "function": { | ||||
|     "name": "multiply",  | ||||
|     "description": "A function that multiplies two numbers",  | ||||
|     "parameters": { | ||||
|       "type": "object",  | ||||
|       "properties": { | ||||
|         "a": { | ||||
|           "type": "number",  | ||||
|           "description": "The first number to multiply" | ||||
|         },  | ||||
|         "b": { | ||||
|           "type": "number", | ||||
|           "description": "The second number to multiply" | ||||
|         } | ||||
|       },  | ||||
|       "required": ["a", "b"] | ||||
|     } | ||||
|   } | ||||
| } | ||||
| ``` | ||||
|  | ||||
| An example for handling tool definitions in a chat template is shown below. The specific tokens and tool descriptions should be changed to match the ones a model was trained with. | ||||
|  | ||||
| ``` | ||||
| {%- if tools %} | ||||
|     {%- for tool in tools %} | ||||
|         {{- '<tool>' + tool['function']['name'] + '\n' }} | ||||
|         {%- for argument in tool['function']['parameters']['properties'] %} | ||||
|             {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }} | ||||
|         {%- endfor %} | ||||
|         {{- '\n</tool>' }} | ||||
|     {%- endfor %} | ||||
| {%- endif %} | ||||
| ``` | ||||
|  | ||||
| ### Tool calls | ||||
|  | ||||
| Tool calls, if present, are stored as a list under the `tool_calls` key of a message with the `"assistant"` role. This is always a list even though most tool-calling models only support single tool calls, which means the list usually only contains a single element. | ||||
|  | ||||
| ```json | ||||
| { | ||||
|   "role": "assistant", | ||||
|   "tool_calls": [ | ||||
|     { | ||||
|       "type": "function", | ||||
|       "function": { | ||||
|         "name": "multiply", | ||||
|         "arguments": { | ||||
|           "a": 5, | ||||
|           "b": 6 | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|   ] | ||||
| } | ||||
| ``` | ||||
|  | ||||
| A common pattern for handling tool calls is shown below. | ||||
|  | ||||
| ``` | ||||
| {%- if message['role'] == 'assistant' and 'tool_calls' in message %} | ||||
|     {%- for tool_call in message['tool_calls'] %} | ||||
|         {{- '<tool_call>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n</tool_call>' }} | ||||
|     {%- endfor %} | ||||
| {%- endif %} | ||||
| ``` | ||||
|  | ||||
| ### Tool responses | ||||
|  | ||||
| Tool responses are a message dict with the `role`, `name` (name of the function) and `content` (result of the tool call) keys. | ||||
|  | ||||
| ```json | ||||
| { | ||||
|   "role": "tool", | ||||
|   "name": "multiply", | ||||
|   "content": "30" | ||||
| } | ||||
| ``` | ||||
|  | ||||
| Not all the keys need to be used in the tool response. For example, if a model doesn’t expect the function name to be included in the tool response, then you can just include the `role` and `content`. | ||||
|  | ||||
| ``` | ||||
| {%- if message['role'] == 'tool' %} | ||||
|     {{- "<tool_result>" + message['content'] + "</tool_result>" }} | ||||
| {%- endif %} | ||||
| ``` | ||||
|  | ||||
| ## Contribute | ||||
|  | ||||
| Add a chat template by setting the `chat_template` attribute in the tokenizer and testing it with [`~PreTrainedTokenizerBase.apply_chat_template`]. If it works as expected, then you can upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`]. | ||||
|  | ||||
| Even if you're not the model owner, it is still helpful to add a template for a model with an empty chat template or a model that is using a default class template. Open a [pull request](https://hf.co/docs/hub/repositories-pull-requests-discussions) on the model repository to add the template. | ||||
|  | ||||
| ```py | ||||
| tokenizer.chat_template = template | ||||
| tokenizer.push_to_hub("model_name") | ||||
| ``` | ||||
| @ -14,71 +14,61 @@ rendered properly in your Markdown viewer. | ||||
|  | ||||
| --> | ||||
|  | ||||
| # Chat basics | ||||
| # Chatting with Transformers | ||||
|  | ||||
| Chat models are conversational models you can send and receive messages from. There are many chat models available to choose from, but in general, larger models tend to be better though that's not always the case. The model size is often included in the name, like "8B" or "70B", and it describes the number of parameters. Mixture-of-expert (MoE) models have names like "8x7B" or "141B-A35B", which means they are roughly 56B and 141B parameter models, respectively. You can try quantizing larger models to reduce memory requirements, otherwise you'll need ~2 bytes of memory per parameter. | ||||
| If you're reading this article, you're almost certainly aware of **chat models**. Chat models are conversational | ||||
| AIs that you can send and receive messages with. The most famous of these is the proprietary ChatGPT, but there are | ||||
| now many open-source chat models which match or even substantially exceed its performance. These models are free to | ||||
| download and run on a local machine. Although the largest and most capable models require high-powered hardware | ||||
| and lots of memory to run, there are smaller models that will run perfectly well on a single consumer GPU, or even | ||||
| an ordinary desktop or notebook CPU.  | ||||
|  | ||||
| Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [LMSys Chatbot Arena](https://chat.lmsys.org/?leaderboard) to further help you identify the best chat models for your use case. Models that are specialized in certain domains (medical, legal text, non-English languages, etc.) may sometimes outperform larger general purpose models. | ||||
| This guide will help you get started with chat models. We'll start with a brief quickstart guide that uses a convenient, | ||||
| high-level "pipeline". This is all you need if you just want to start running a chat model  | ||||
| immediately. After the quickstart, we'll move on to more detailed information about | ||||
| what exactly chat models are, how to choose an appropriate one, and a low-level breakdown of each of the | ||||
| steps involved in talking to a chat model. We'll also give some tips on optimizing the performance and memory usage | ||||
| of your chat models. | ||||
|  | ||||
| > [!TIP] | ||||
| > Chat with a number of open-source models for free on [HuggingChat](https://hf.co/chat/)! | ||||
|  | ||||
| This guide shows you how to quickly start chatting with Transformers from the command line, how to build and format a conversation, and how to chat using the [`TextGenerationPipeline`]. | ||||
| ## Quickstart | ||||
|  | ||||
| ## transformers CLI | ||||
| If you have no time for details, here's the brief summary: Chat models continue chats. This means that you pass them | ||||
| a conversation history, which can be as short as a single user message, and the model will continue the conversation | ||||
| by adding its response. Let's see this in action. First, let's build a chat: | ||||
|  | ||||
| After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session. | ||||
|  | ||||
| ```bash | ||||
| transformers chat Qwen/Qwen2.5-0.5B-Instruct | ||||
| ``` | ||||
|  | ||||
| <div class="flex justify-center"> | ||||
|     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers-chat-cli.png"/> | ||||
| </div> | ||||
|  | ||||
| You can launch the CLI with arbitrary `generate` flags, with the format `arg_1=value_1 arg_2=value_2 ...` | ||||
|  | ||||
| ```bash | ||||
| transformers chat Qwen/Qwen2.5-0.5B-Instruct do_sample=False max_new_tokens=10 | ||||
| ``` | ||||
|  | ||||
| For a full list of options, run the command below. | ||||
|  | ||||
| ```bash | ||||
| transformers chat -h | ||||
| ``` | ||||
|  | ||||
| The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). | ||||
|  | ||||
| ## TextGenerationPipeline | ||||
|  | ||||
| [`TextGenerationPipeline`] is a high-level text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format). | ||||
|  | ||||
| To start, build a chat history with the following two roles. | ||||
|  | ||||
| - `system` describes how the model should behave and respond when you're chatting with it. This role isn't supported by all chat models. | ||||
| - `user` is where you enter your first message to the model. | ||||
|  | ||||
| ```py | ||||
| ```python | ||||
| chat = [ | ||||
|     {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, | ||||
|     {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| Create the [`TextGenerationPipeline`] and pass `chat` to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory. | ||||
| Notice that in addition to the user's message, we added a **system** message at the start of the conversation. Not all | ||||
| chat models support system messages, but when they do, they represent high-level directives about how the model | ||||
| should behave in the conversation. You can use this to guide the model - whether you want short or long responses, | ||||
| lighthearted or serious ones, and so on. If you want the model to do useful work instead of | ||||
| practicing its improv routine, you can either omit the system message or try a terse one such as "You are a helpful and intelligent | ||||
| AI assistant who responds to user queries." | ||||
|  | ||||
| ```py | ||||
| Once you have a chat, the quickest way to continue it is using the [`TextGenerationPipeline`].  | ||||
| Let's see this in action with `LLaMA-3`. Note that `LLaMA-3` is a gated model, which means you will need to  | ||||
| [apply for access](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and log in with your Hugging Face  | ||||
| account to use it. We'll also use `device_map="auto"`, which will load the model on GPU if there's enough memory | ||||
| for it, and set the dtype to `torch.bfloat16` to save memory: | ||||
|  | ||||
| ```python | ||||
| import torch | ||||
| from transformers import pipeline | ||||
|  | ||||
| pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto") | ||||
| response = pipeline(chat, max_new_tokens=512) | ||||
| print(response[0]["generated_text"][-1]["content"]) | ||||
| pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto") | ||||
| response = pipe(chat, max_new_tokens=512) | ||||
| print(response[0]['generated_text'][-1]['content']) | ||||
| ``` | ||||
|  | ||||
| ```txt | ||||
| And you'll get: | ||||
|  | ||||
| ```text | ||||
| (sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,  | ||||
| alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide! | ||||
|  | ||||
| @ -101,18 +91,22 @@ So, there you have it, pal! That's my expert advice on what to do in New York. N | ||||
| excuse me, I've got some oil changes to attend to. (winks) | ||||
| ``` | ||||
|  | ||||
| Use the `append` method on `chat` to respond to the model's message. | ||||
| You can continue the chat by appending your own response to it. The | ||||
| `response` object returned by the pipeline actually contains the entire chat so far, so we can simply append | ||||
| a message and pass it back: | ||||
|  | ||||
| ```py | ||||
| chat = response[0]["generated_text"] | ||||
| ```python | ||||
| chat = response[0]['generated_text'] | ||||
| chat.append( | ||||
|     {"role": "user", "content": "Wait, what's so wild about soup cans?"} | ||||
| ) | ||||
| response = pipeline(chat, max_new_tokens=512) | ||||
| print(response[0]["generated_text"][-1]["content"]) | ||||
| response = pipe(chat, max_new_tokens=512) | ||||
| print(response[0]['generated_text'][-1]['content']) | ||||
| ``` | ||||
|  | ||||
| ```txt | ||||
| And you'll get: | ||||
|  | ||||
| ```text | ||||
| (laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!  | ||||
| It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's  | ||||
| like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"  | ||||
| @ -126,35 +120,171 @@ But, hey, you're not alone, pal. I mean, I'm a robot, and even I don't get it. ( | ||||
| But, hey, that's what makes art, art, right? (laughs) | ||||
| ``` | ||||
|  | ||||
| ## Performance | ||||
| The remainder of this tutorial will cover specific topics such | ||||
| as performance and memory, or how to select a chat model for your needs. | ||||
|  | ||||
| Transformers loads models in full precision by default, and for an 8B model, this requires ~32GB of memory! Reduce memory usage by loading a model in half-precision or bfloat16 (only uses ~2 bytes per parameter). You can even quantize the model to a lower precision like 8-bit or 4-bit with [bitsandbytes](https://hf.co/docs/bitsandbytes/index). | ||||
| ## Choosing a chat model | ||||
|  | ||||
| > [!TIP] | ||||
| > Refer to the [Quantization](./quantization/overview) docs for more information about the different quantization backends available. | ||||
| There are an enormous number of different chat models available on the [Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending), | ||||
| and new users often feel very overwhelmed by the selection offered. Don't be, though! You really need to just focus on | ||||
| two important considerations:  | ||||
| - The model's size, which will determine if you can fit it in memory and how quickly it will | ||||
| run. | ||||
| - The quality of the model's chat output. | ||||
|  | ||||
| Create a [`BitsAndBytesConfig`] with your desired quantization settings and pass it to the pipeline's `model_kwargs` parameter. The example below quantizes a model to 8-bits. | ||||
| In general, these are correlated - bigger models tend to be  | ||||
| more capable, but even so there's a lot of variation at a given size point! | ||||
|  | ||||
| ```py | ||||
| from transformers import pipeline, BitsAndBytesConfig | ||||
| ### Size and model naming | ||||
| The size of a model is easy to spot - it's the number in the model name, like "8B" or "70B". This is the number of | ||||
| **parameters** in the model. Without quantization, you should expect to need about 2 bytes of memory per parameter. | ||||
| This means that an "8B" model with 8 billion parameters will need about 16GB of memory just to fit the parameters,  | ||||
| plus a little extra for other overhead. It's a good fit for a high-end consumer GPU with 24GB of memory, such as a 3090 | ||||
| or 4090. | ||||
|  | ||||
| quantization_config = BitsAndBytesConfig(load_in_8bit=True) | ||||
| pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config}) | ||||
| Some chat models are "Mixture of Experts" models. These may list their sizes in different ways, such as "8x7B" or  | ||||
| "141B-A35B". The numbers are a little fuzzier here, but in general you can read this as saying that the model | ||||
| has approximately 56 (8x7) billion parameters in the first case, or 141 billion parameters in the second case. | ||||
|  | ||||
| Note that it is very common to use quantization techniques to reduce the memory usage per parameter to 8 bits, 4 bits, | ||||
| or even less. This topic is discussed in more detail in the [Memory considerations](#memory-considerations) section below. | ||||
|  | ||||
| ### But which chat model is best? | ||||
| Even once you know the size of chat model you can run, there's still a lot of choice out there. One way to sift through | ||||
| it all is to consult **leaderboards**. Two of the most popular leaderboards are the [OpenLLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) | ||||
| and the [LMSys Chatbot Arena Leaderboard](https://chat.lmsys.org/?leaderboard). Note that the LMSys leaderboard | ||||
| also includes proprietary models - look at the `licence` column to identify open-source ones that you can download, then | ||||
| search for them on the [Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending). | ||||
|  | ||||
| ### Specialist domains | ||||
| Some models may be specialized for certain domains, such as medical or legal text, or non-English languages.  | ||||
| If you're working in these domains, you may find that a specialized model will give you big performance benefits.  | ||||
| Don't automatically assume that, though! Particularly when specialized models are smaller or older than the current  | ||||
| cutting-edge, a top-end general-purpose model may still outclass them. Thankfully, we are beginning to see  | ||||
| [domain-specific leaderboards](https://huggingface.co/blog/leaderboard-medicalllm) that should make it easier to locate | ||||
| the best models for specialized domains. | ||||
|  | ||||
| ## What happens inside the pipeline? | ||||
|  | ||||
| The quickstart above used a high-level pipeline to chat with a chat model, which is convenient, but not the | ||||
| most flexible. Let's take a more low-level approach, to see each of the steps involved in chat. Let's start with | ||||
| a code sample, and then break it down: | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoModelForCausalLM, AutoTokenizer | ||||
| import torch | ||||
|  | ||||
| # Prepare the input as before | ||||
| chat = [ | ||||
|     {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, | ||||
|     {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} | ||||
| ] | ||||
|  | ||||
| # 1: Load the model and tokenizer | ||||
| model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16) | ||||
| tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") | ||||
|  | ||||
| # 2: Apply the chat template | ||||
| formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) | ||||
| print("Formatted chat:\n", formatted_chat) | ||||
|  | ||||
| # 3: Tokenize the chat (This can be combined with the previous step using tokenize=True) | ||||
| inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False) | ||||
| # Move the tokenized inputs to the same device the model is on (GPU/CPU) | ||||
| inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()} | ||||
| print("Tokenized inputs:\n", inputs) | ||||
|  | ||||
| # 4: Generate text from the model | ||||
| outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1) | ||||
| print("Generated tokens:\n", outputs) | ||||
|  | ||||
| # 5: Decode the output back to a string | ||||
| decoded_output = tokenizer.decode(outputs[0][inputs['input_ids'].size(1):], skip_special_tokens=True) | ||||
| print("Decoded output:\n", decoded_output) | ||||
| ``` | ||||
|  | ||||
| In general, larger models are slower in addition to requiring more memory because text generation is bottlenecked by **memory bandwidth** instead of compute power. Each active parameter must be read from memory for every generated token. For a 16GB model, 16GB must be read from memory for every generated token. | ||||
| There's a lot in here, each piece of which could be its own document! Rather than going into too much detail, I'll cover | ||||
| the broad ideas, and leave the details for the linked documents. The key steps are: | ||||
|  | ||||
| The number of generated tokens/sec is proportional to the total memory bandwidth of the system divided by the model size. Depending on your hardware, total memory bandwidth can vary. Refer to the table below for approximate generation speeds for different hardware types. | ||||
| 1. [Models](https://huggingface.co/learn/nlp-course/en/chapter2/3) and [Tokenizers](https://huggingface.co/learn/nlp-course/en/chapter2/4?fw=pt) are loaded from the Hugging Face Hub. | ||||
| 2. The chat is formatted using the tokenizer's [chat template](https://huggingface.co/docs/transformers/main/en/chat_templating) | ||||
| 3. The formatted chat is [tokenized](https://huggingface.co/learn/nlp-course/en/chapter2/4) using the tokenizer. | ||||
| 4. We [generate](https://huggingface.co/docs/transformers/en/llm_tutorial) a response from the model. | ||||
| 5. The tokens output by the model are decoded back to a string | ||||
|  | ||||
| | Hardware | Memory bandwidth | | ||||
| |---|---| | ||||
| | consumer CPU | 20-100GB/sec | | ||||
| | specialized CPU (Intel Xeon, AMD Threadripper/Epyc, Apple silicon) | 200-900GB/sec | | ||||
| | data center GPU (NVIDIA A100/H100) | 2-3TB/sec | | ||||
| ## Performance, memory and hardware | ||||
|  | ||||
| The easiest solution for improving generation speed is to either quantize a model or use hardware with higher memory bandwidth. | ||||
| You probably know by now that most machine learning tasks are run on GPUs. However, it is entirely possible | ||||
| to generate text from a chat model or language model on a CPU, albeit somewhat more slowly. If you can fit | ||||
| the model in GPU memory, though, this will usually be the preferable option. | ||||
|  | ||||
| You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed. | ||||
| ### Memory considerations | ||||
|  | ||||
| By default, Hugging Face classes like [`TextGenerationPipeline`] or [`AutoModelForCausalLM`] will load the model in  | ||||
| `float32` precision. This means that it will need 4 bytes (32 bits) per parameter, so an "8B" model with 8 billion | ||||
| parameters will need ~32GB of memory. However, this can be wasteful! Most modern language models are trained in  | ||||
| "bfloat16" precision, which uses only 2 bytes per parameter. If your hardware supports it (Nvidia 30xx/Axxx | ||||
| or newer), you can load the model in `bfloat16` precision, using the `torch_dtype` argument as we did above. | ||||
|  | ||||
| It is possible to go even lower than 16-bits using "quantization", a method to lossily compress model weights. This | ||||
| allows each parameter to be squeezed down to 8 bits, 4 bits or even less. Note that, especially at 4 bits, | ||||
| the model's outputs may be negatively affected, but often this is a tradeoff worth making to fit a larger and more | ||||
| capable chat model in memory. Let's see this in action with `bitsandbytes`: | ||||
|  | ||||
| ```python | ||||
| from transformers import AutoModelForCausalLM, BitsAndBytesConfig | ||||
|  | ||||
| quantization_config = BitsAndBytesConfig(load_in_8bit=True)  # You can also try load_in_4bit | ||||
| model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", quantization_config=quantization_config) | ||||
| ``` | ||||
|  | ||||
| Or we can do the same thing using the `pipeline` API: | ||||
|  | ||||
| ```python | ||||
| from transformers import pipeline, BitsAndBytesConfig | ||||
|  | ||||
| quantization_config = BitsAndBytesConfig(load_in_8bit=True)  # You can also try load_in_4bit | ||||
| pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config}) | ||||
| ``` | ||||
|  | ||||
| There are several other options for quantizing models besides `bitsandbytes` - please see the [Quantization guide](./quantization) | ||||
| for more information. | ||||
|  | ||||
| ### Performance considerations | ||||
|  | ||||
| <Tip> | ||||
|  | ||||
| For a more extensive guide on language model performance and optimization, check out [LLM Inference Optimization](./llm_optims). | ||||
|  | ||||
| </Tip> | ||||
|  | ||||
|  | ||||
| As a general rule, larger chat models will be slower in addition to requiring more memory. It's possible to be | ||||
| more concrete about this, though: Generating text from a chat model is unusual in that it is bottlenecked by | ||||
| **memory bandwidth** rather than compute power, because every active parameter must be read from memory for each | ||||
| token that the model generates. This means that the number of tokens per second you can generate from a chat | ||||
| model is generally proportional to the total bandwidth of the memory it resides in, divided by the size of the model. | ||||
|  | ||||
| In our quickstart example above, our model was ~16GB in size when loaded in `bfloat16` precision.  | ||||
| This means that 16GB must be read from memory for every token generated by the model. Total memory bandwidth can | ||||
| vary from 20-100GB/sec for consumer CPUs to 200-900GB/sec for consumer GPUs, specialized CPUs like | ||||
| Intel Xeon, AMD Threadripper/Epyc or high-end Apple silicon, and finally up to 2-3TB/sec for data center GPUs like | ||||
| the Nvidia A100 or H100. This should give you a good idea of the generation speed you can expect from these different | ||||
| hardware types. | ||||
|  | ||||
| Therefore, if you want to improve the speed of text generation, the easiest solution is to either reduce the | ||||
| size of the model in memory (usually by quantization), or get hardware with higher memory bandwidth. For advanced users,  | ||||
| several other techniques exist to get around this bandwidth bottleneck. The most common are variants on  | ||||
| [assisted generation](https://huggingface.co/blog/assisted-generation), also known as "speculative | ||||
| sampling". These techniques try to guess multiple future tokens at once, often using a smaller "draft model", and then | ||||
| confirm these generations with the chat model. If the guesses are validated by the chat model, more than one token can | ||||
| be generated per forward pass, which greatly alleviates the bandwidth bottleneck and improves generation speed.   | ||||
|  | ||||
| Finally, we should also note the impact of "Mixture of Experts" (MoE) models here. Several popular chat models, | ||||
| such as Mixtral, Qwen-MoE and DBRX, are MoE models. In these models, not every parameter is active for every token generated. | ||||
| As a result, MoE models generally have much lower memory bandwidth requirements, even though their total size | ||||
| can be quite large. They can therefore be several times faster than a normal "dense" model of the same size. However, | ||||
| techniques like assisted generation are generally ineffective for these models because more parameters will become | ||||
| active with each new speculated token, which will negate the bandwidth and speed benefits that the MoE architecture | ||||
| provides. | ||||
|  | ||||
| > [!TIP] | ||||
| > Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe.md), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token. | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user
	