mirror of https://github.com/huggingface/transformers.git
synced 2025-10-21 09:44:02 +08:00

Compare commits: remove_dat ... v4.57.0 (2 commits: 8ac2b916b0, 2ccc6cae21)
@ -16,10 +16,9 @@
import argparse
import copy
import os
import random
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import glob
from typing import Any, Optional

import yaml

@ -30,6 +29,7 @@ COMMON_ENV_VARIABLES = {
    "RUN_PIPELINE_TESTS": False,
    # will be adjusted in `CircleCIJob.to_dict`.
    "RUN_FLAKY": True,
    "DISABLE_SAFETENSORS_CONVERSION": True,
}
# Disable the use of {"s": None} as the output is way too long, making navigation on CircleCI impractical
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}

@ -82,15 +82,15 @@ class EmptyJob:
@dataclass
class CircleCIJob:
    name: str
    additional_env: Dict[str, Any] = None
    docker_image: List[Dict[str, str]] = None
    install_steps: List[str] = None
    additional_env: dict[str, Any] = None
    docker_image: list[dict[str, str]] = None
    install_steps: list[str] = None
    marker: Optional[str] = None
    parallelism: Optional[int] = 0
    pytest_num_workers: int = 8
    pytest_options: Dict[str, Any] = None
    pytest_options: dict[str, Any] = None
    resource_class: Optional[str] = "xlarge"
    tests_to_run: Optional[List[str]] = None
    tests_to_run: Optional[list[str]] = None
    num_test_files_per_worker: Optional[int] = 10
    # This should be only used for doctest job!
    command_timeout: Optional[int] = None

@ -130,6 +130,12 @@ class CircleCIJob:
    def to_dict(self):
        env = COMMON_ENV_VARIABLES.copy()
        if self.job_name != "tests_hub":
            # fmt: off
            # not critical
            env.update({"HF_TOKEN": "".join(["h", "f", "_", "H", "o", "d", "V", "u", "M", "q", "b", "R", "m", "t", "b", "z", "F", "Q", "O", "Q", "A", "J", "G", "D", "l", "V", "Q", "r", "R", "N", "w", "D", "M", "V", "C", "s", "d"])})
            # fmt: on

        # Do not run tests decorated by @is_flaky on pull requests
        env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
        env.update(self.additional_env)

@ -149,7 +155,7 @@ class CircleCIJob:
        # Examples special case: we need to download NLTK files in advance to avoid concurrency issues
        timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else ""
        marker_cmd = f"-m '{self.marker}'" if self.marker is not None else ""
        junit_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
        junit_flags = " -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
        joined_flaky_patterns = "|".join(FLAKY_TEST_FAILURE_PATTERNS)
        repeat_on_failure_flags = f"--reruns 5 --reruns-delay 2 --only-rerun '({joined_flaky_patterns})'"
        parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> '

@ -180,6 +186,7 @@ class CircleCIJob:
            # During the CircleCI docker images build time, we might already (or not) download the data.
            # If it's done already, the files are inside the directory `/test_data/`.
            {"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}},
            {"run": {"name": "download and unzip hub cache", "command": 'curl -L -o huggingface-cache.tar.gz https://huggingface.co/datasets/hf-internal-testing/hf_hub_cache/resolve/main/huggingface-cache.tar.gz && apt-get install pigz && tar --use-compress-program="pigz -d -p 8" -xf huggingface-cache.tar.gz && mv -n hub/* /root/.cache/huggingface/hub/ && ls -la /root/.cache/huggingface/hub/'}},
            {"run": {
                "name": "Run tests",
                "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}

@ -200,9 +207,9 @@ class CircleCIJob:
                    fi"""
                },
            },
            {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
            {"run": {"name": "Failed tests: show reasons", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
            {"run": {"name": "Errors", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}},
            {"run": {"name": "Expand to show skipped tests", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
            {"run": {"name": "Failed tests: show reasons", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
            {"run": {"name": "Errors", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}},
            {"store_test_results": {"path": "test-results"}},
            {"store_artifacts": {"path": "test-results/junit.xml"}},
            {"store_artifacts": {"path": "reports"}},
@ -1,5 +1,6 @@
import re
import argparse
import re


def parse_pytest_output(file_path):
    skipped_tests = {}
.github/ISSUE_TEMPLATE/bug-report.yml (1 changed line)

@ -61,6 +61,7 @@ body:
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
- kernels: @MekkCyber @drbh
- peft: @BenjaminBossan @githubnemo

Devices/Backends:
.github/PULL_REQUEST_TEMPLATE.md (40 changed lines)

@ -39,20 +39,23 @@ members/contributors who may be interested in your PR.
Models:

- text models: @ArthurZucker
- vision models: @amyeroberts, @qubvel
- speech models: @eustlb
- text models: @ArthurZucker @Cyrilvallez
- vision models: @yonigozlan @molbap
- audio models: @eustlb @ebezzam @vasqu
- multimodal models: @zucchini-nlp
- graph models: @clefourrier

Library:

- flax: @gante and @Rocketknight1
- generate: @zucchini-nlp (visual-language models) or @gante (all others)
- continuous batching: @remi-or @ArthurZucker @McPatate
- pipelines: @Rocketknight1
- tensorflow: @gante and @Rocketknight1
- tokenizers: @ArthurZucker
- trainer: @zach-huggingface, @SunMarc and @qgallouedec
- chat templates: @Rocketknight1
- tokenizers: @ArthurZucker and @itazap
- trainer: @zach-huggingface @SunMarc
- attention: @vasqu @ArthurZucker @CyrilVallez
- model loading (from pretrained, etc): @CyrilVallez
- distributed: @3outeille @ArthurZucker @S1ro1
- CIs: @ydshieh

Integrations:

@ -60,20 +63,17 @@ Integrations:
- ray/raytune: @richardliaw, @amogkam
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
- kernels: @MekkCyber @drbh
- peft: @BenjaminBossan @githubnemo

Devices/Backends:

- AMD ROCm: @ivarflakstad
- Intel XPU: @IlyasMoutawwakil
- Ascend NPU: @ivarflakstad

Documentation: @stevhliu

HF projects:

- accelerate: [different repo](https://github.com/huggingface/accelerate)
- datasets: [different repo](https://github.com/huggingface/datasets)
- diffusers: [different repo](https://github.com/huggingface/diffusers)
- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)

Maintained examples (not research project or legacy):

- Flax: @Rocketknight1
- PyTorch: See Models above and tag the person corresponding to the modality of the example.
- TensorFlow: @Rocketknight1
Research projects are not maintained and should be taken as is.

-->
.github/scripts/assign_reviewers.py (8 changed lines)

@ -13,14 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import github
import json
from github import Github
import os
import re
from collections import Counter
from pathlib import Path

import github
from github import Github


def pattern_to_regex(pattern):
    if pattern.startswith("/"):
        start_anchor = True
.github/scripts/codeowners_for_review_action (130 changed lines)

@ -7,8 +7,8 @@ docs/ @stevhliu
/docker/ @ydshieh @ArthurZucker

# More high-level globs catch cases when specific rules later don't apply
/src/transformers/models/*/processing* @molbap @yonigozlan @qubvel
/src/transformers/models/*/image_processing* @qubvel
/src/transformers/models/*/processing* @molbap @yonigozlan
/src/transformers/models/*/image_processing* @yonigozlan
/src/transformers/models/*/image_processing_*_fast* @yonigozlan

# Owners of subsections of the library

@ -186,65 +186,65 @@ trainer_utils.py @zach-huggingface @SunMarc
/src/transformers/models/zamba/mod*_zamba* @ArthurZucker

# Vision models
/src/transformers/models/beit/mod*_beit* @amyeroberts @qubvel
/src/transformers/models/bit/mod*_bit* @amyeroberts @qubvel
/src/transformers/models/conditional_detr/mod*_conditional_detr* @amyeroberts @qubvel
/src/transformers/models/convnext/mod*_convnext* @amyeroberts @qubvel
/src/transformers/models/convnextv2/mod*_convnextv2* @amyeroberts @qubvel
/src/transformers/models/cvt/mod*_cvt* @amyeroberts @qubvel
/src/transformers/models/deformable_detr/mod*_deformable_detr* @amyeroberts @qubvel
/src/transformers/models/deit/mod*_deit* @amyeroberts @qubvel
/src/transformers/models/depth_anything/mod*_depth_anything* @amyeroberts @qubvel
/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @amyeroberts @qubvel
/src/transformers/models/deta/mod*_deta* @amyeroberts @qubvel
/src/transformers/models/detr/mod*_detr* @amyeroberts @qubvel
/src/transformers/models/dinat/mod*_dinat* @amyeroberts @qubvel
/src/transformers/models/dinov2/mod*_dinov2* @amyeroberts @qubvel
/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @amyeroberts @qubvel
/src/transformers/models/dit/mod*_dit* @amyeroberts @qubvel
/src/transformers/models/dpt/mod*_dpt* @amyeroberts @qubvel
/src/transformers/models/efficientformer/mod*_efficientformer* @amyeroberts @qubvel
/src/transformers/models/efficientnet/mod*_efficientnet* @amyeroberts @qubvel
/src/transformers/models/focalnet/mod*_focalnet* @amyeroberts @qubvel
/src/transformers/models/glpn/mod*_glpn* @amyeroberts @qubvel
/src/transformers/models/hiera/mod*_hiera* @amyeroberts @qubvel
/src/transformers/models/ijepa/mod*_ijepa* @amyeroberts @qubvel
/src/transformers/models/imagegpt/mod*_imagegpt* @amyeroberts @qubvel
/src/transformers/models/levit/mod*_levit* @amyeroberts @qubvel
/src/transformers/models/mask2former/mod*_mask2former* @amyeroberts @qubvel
/src/transformers/models/maskformer/mod*_maskformer* @amyeroberts @qubvel
/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @amyeroberts @qubvel
/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @amyeroberts @qubvel
/src/transformers/models/mobilevit/mod*_mobilevit* @amyeroberts @qubvel
/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @amyeroberts @qubvel
/src/transformers/models/nat/mod*_nat* @amyeroberts @qubvel
/src/transformers/models/poolformer/mod*_poolformer* @amyeroberts @qubvel
/src/transformers/models/pvt/mod*_pvt* @amyeroberts @qubvel
/src/transformers/models/pvt_v2/mod*_pvt_v2* @amyeroberts @qubvel
/src/transformers/models/regnet/mod*_regnet* @amyeroberts @qubvel
/src/transformers/models/resnet/mod*_resnet* @amyeroberts @qubvel
/src/transformers/models/rt_detr/mod*_rt_detr* @amyeroberts @qubvel
/src/transformers/models/segformer/mod*_segformer* @amyeroberts @qubvel
/src/transformers/models/seggpt/mod*_seggpt* @amyeroberts @qubvel
/src/transformers/models/superpoint/mod*_superpoint* @amyeroberts @qubvel
/src/transformers/models/swiftformer/mod*_swiftformer* @amyeroberts @qubvel
/src/transformers/models/swin/mod*_swin* @amyeroberts @qubvel
/src/transformers/models/swinv2/mod*_swinv2* @amyeroberts @qubvel
/src/transformers/models/swin2sr/mod*_swin2sr* @amyeroberts @qubvel
/src/transformers/models/table_transformer/mod*_table_transformer* @amyeroberts @qubvel
/src/transformers/models/textnet/mod*_textnet* @amyeroberts @qubvel
/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @amyeroberts @qubvel
/src/transformers/models/upernet/mod*_upernet* @amyeroberts @qubvel
/src/transformers/models/van/mod*_van* @amyeroberts @qubvel
/src/transformers/models/vit/mod*_vit* @amyeroberts @qubvel
/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @amyeroberts @qubvel
/src/transformers/models/vitdet/mod*_vitdet* @amyeroberts @qubvel
/src/transformers/models/vit_mae/mod*_vit_mae* @amyeroberts @qubvel
/src/transformers/models/vitmatte/mod*_vitmatte* @amyeroberts @qubvel
/src/transformers/models/vit_msn/mod*_vit_msn* @amyeroberts @qubvel
/src/transformers/models/vitpose/mod*_vitpose* @amyeroberts @qubvel
/src/transformers/models/yolos/mod*_yolos* @amyeroberts @qubvel
/src/transformers/models/zoedepth/mod*_zoedepth* @amyeroberts @qubvel
/src/transformers/models/beit/mod*_beit* @yonigozlan @molbap
/src/transformers/models/bit/mod*_bit* @yonigozlan @molbap
/src/transformers/models/conditional_detr/mod*_conditional_detr* @yonigozlan @molbap
/src/transformers/models/convnext/mod*_convnext* @yonigozlan @molbap
/src/transformers/models/convnextv2/mod*_convnextv2* @yonigozlan @molbap
/src/transformers/models/cvt/mod*_cvt* @yonigozlan @molbap
/src/transformers/models/deformable_detr/mod*_deformable_detr* @yonigozlan @molbap
/src/transformers/models/deit/mod*_deit* @yonigozlan @molbap
/src/transformers/models/depth_anything/mod*_depth_anything* @yonigozlan @molbap
/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @yonigozlan @molbap
/src/transformers/models/deta/mod*_deta* @yonigozlan @molbap
/src/transformers/models/detr/mod*_detr* @yonigozlan @molbap
/src/transformers/models/dinat/mod*_dinat* @yonigozlan @molbap
/src/transformers/models/dinov2/mod*_dinov2* @yonigozlan @molbap
/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @yonigozlan @molbap
/src/transformers/models/dit/mod*_dit* @yonigozlan @molbap
/src/transformers/models/dpt/mod*_dpt* @yonigozlan @molbap
/src/transformers/models/efficientformer/mod*_efficientformer* @yonigozlan @molbap
/src/transformers/models/efficientnet/mod*_efficientnet* @yonigozlan @molbap
/src/transformers/models/focalnet/mod*_focalnet* @yonigozlan @molbap
/src/transformers/models/glpn/mod*_glpn* @yonigozlan @molbap
/src/transformers/models/hiera/mod*_hiera* @yonigozlan @molbap
/src/transformers/models/ijepa/mod*_ijepa* @yonigozlan @molbap
/src/transformers/models/imagegpt/mod*_imagegpt* @yonigozlan @molbap
/src/transformers/models/levit/mod*_levit* @yonigozlan @molbap
/src/transformers/models/mask2former/mod*_mask2former* @yonigozlan @molbap
/src/transformers/models/maskformer/mod*_maskformer* @yonigozlan @molbap
/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @yonigozlan @molbap
/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @yonigozlan @molbap
/src/transformers/models/mobilevit/mod*_mobilevit* @yonigozlan @molbap
/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @yonigozlan @molbap
/src/transformers/models/nat/mod*_nat* @yonigozlan @molbap
/src/transformers/models/poolformer/mod*_poolformer* @yonigozlan @molbap
/src/transformers/models/pvt/mod*_pvt* @yonigozlan @molbap
/src/transformers/models/pvt_v2/mod*_pvt_v2* @yonigozlan @molbap
/src/transformers/models/regnet/mod*_regnet* @yonigozlan @molbap
/src/transformers/models/resnet/mod*_resnet* @yonigozlan @molbap
/src/transformers/models/rt_detr/mod*_rt_detr* @yonigozlan @molbap
/src/transformers/models/segformer/mod*_segformer* @yonigozlan @molbap
/src/transformers/models/seggpt/mod*_seggpt* @yonigozlan @molbap
/src/transformers/models/superpoint/mod*_superpoint* @yonigozlan @molbap
/src/transformers/models/swiftformer/mod*_swiftformer* @yonigozlan @molbap
/src/transformers/models/swin/mod*_swin* @yonigozlan @molbap
/src/transformers/models/swinv2/mod*_swinv2* @yonigozlan @molbap
/src/transformers/models/swin2sr/mod*_swin2sr* @yonigozlan @molbap
/src/transformers/models/table_transformer/mod*_table_transformer* @yonigozlan @molbap
/src/transformers/models/textnet/mod*_textnet* @yonigozlan @molbap
/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @yonigozlan @molbap
/src/transformers/models/upernet/mod*_upernet* @yonigozlan @molbap
/src/transformers/models/van/mod*_van* @yonigozlan @molbap
/src/transformers/models/vit/mod*_vit* @yonigozlan @molbap
/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @yonigozlan @molbap
/src/transformers/models/vitdet/mod*_vitdet* @yonigozlan @molbap
/src/transformers/models/vit_mae/mod*_vit_mae* @yonigozlan @molbap
/src/transformers/models/vitmatte/mod*_vitmatte* @yonigozlan @molbap
/src/transformers/models/vit_msn/mod*_vit_msn* @yonigozlan @molbap
/src/transformers/models/vitpose/mod*_vitpose* @yonigozlan @molbap
/src/transformers/models/yolos/mod*_yolos* @yonigozlan @molbap
/src/transformers/models/zoedepth/mod*_zoedepth* @yonigozlan @molbap

# Audio models
/src/transformers/models/audio_spectrogram_transformer/mod*_audio_spectrogram_transformer* @eustlb

@ -304,7 +304,7 @@ trainer_utils.py @zach-huggingface @SunMarc
/src/transformers/models/donut/mod*_donut* @zucchini-nlp
/src/transformers/models/flava/mod*_flava* @zucchini-nlp
/src/transformers/models/git/mod*_git* @zucchini-nlp
/src/transformers/models/grounding_dino/mod*_grounding_dino* @qubvel
/src/transformers/models/grounding_dino/mod*_grounding_dino* @yonigozlan
/src/transformers/models/groupvit/mod*_groupvit* @zucchini-nlp
/src/transformers/models/idefics/mod*_idefics* @zucchini-nlp
/src/transformers/models/idefics2/mod*_idefics2* @zucchini-nlp

@ -326,10 +326,10 @@ trainer_utils.py @zach-huggingface @SunMarc
/src/transformers/models/mgp_str/mod*_mgp_str* @zucchini-nlp
/src/transformers/models/mllama/mod*_mllama* @zucchini-nlp
/src/transformers/models/nougat/mod*_nougat* @NielsRogge
/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @qubvel @yonigozlan
/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @yonigozlan
/src/transformers/models/oneformer/mod*_oneformer* @zucchini-nlp
/src/transformers/models/owlvit/mod*_owlvit* @qubvel
/src/transformers/models/owlv2/mod*_owlv2* @qubvel
/src/transformers/models/owlvit/mod*_owlvit* @yonigozlan
/src/transformers/models/owlv2/mod*_owlv2* @yonigozlan
/src/transformers/models/paligemma/mod*_paligemma* @zucchini-nlp @molbap
/src/transformers/models/perceiver/mod*_perceiver* @zucchini-nlp
/src/transformers/models/pix2struct/mod*_pix2struct* @zucchini-nlp
.github/workflows/benchmark_v2.yml (new file, 85 lines)

@ -0,0 +1,85 @@
name: Benchmark v2 Framework

on:
  workflow_call:
    inputs:
      runner:
        description: 'GH Actions runner group to use'
        required: true
        type: string
      container_image:
        description: 'Docker image to use'
        required: true
        type: string
      container_options:
        description: 'Container options to use'
        required: true
        type: string
      commit_sha:
        description: 'Commit SHA to benchmark'
        required: false
        type: string
        default: ''
      run_id:
        description: 'Custom run ID for organizing results (auto-generated if not provided)'
        required: false
        type: string
        default: ''
      benchmark_repo_id:
        description: 'HuggingFace Dataset to upload results to (e.g., "org/benchmark-results")'
        required: false
        type: string
        default: ''

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
  # This token is created under the bot `hf-transformers-bot`.
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}

jobs:
  benchmark-v2:
    name: Benchmark v2
    runs-on: ${{ inputs.runner }}
    if: |
      (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark')) ||
      (github.event_name == 'schedule')
    container:
      image: ${{ inputs.container_image }}
      options: ${{ inputs.container_options }}
    steps:
      - name: Get repo
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.commit_sha || github.sha }}

      - name: Install benchmark dependencies
        run: |
          python3 -m pip install -r benchmark_v2/requirements.txt

      - name: Reinstall transformers in edit mode
        run: |
          python3 -m pip uninstall -y transformers
          python3 -m pip install -e ".[torch]"

      - name: Show installed libraries and their versions
        run: |
          python3 -m pip list
          python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
          python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
          python3 -c "import torch; print(f'CUDA device count: {torch.cuda.device_count()}')" || true
          nvidia-smi || true

      - name: Run benchmark v2
        working-directory: benchmark_v2
        run: |
          echo "Running benchmarks"
          python3 run_benchmarks.py \
            --commit-id '${{ inputs.commit_sha || github.sha }}' \
            --run-id '${{ inputs.run_id }}' \
            --push-to-hub '${{ inputs.benchmark_repo_id}}' \
            --token '${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}' \
            --log-level INFO
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
.github/workflows/benchmark_v2_a10_caller.yml (new file, 21 lines)

@ -0,0 +1,21 @@
name: Benchmark v2 Scheduled Runner - A10 Single-GPU

on:
  schedule:
    # Run daily at 16:30 UTC
    - cron: "30 16 * * *"
  pull_request:
    types: [ opened, labeled, reopened, synchronize ]

jobs:
  benchmark-v2-default:
    name: Benchmark v2 - Default Models
    uses: ./.github/workflows/benchmark_v2.yml
    with:
      runner: aws-g5-4xlarge-cache-use1-public-80
      container_image: huggingface/transformers-pytorch-gpu
      container_options: --gpus all --privileged --ipc host --shm-size "16gb"
      commit_sha: ${{ github.sha }}
      run_id: ${{ github.run_id }}
      benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
    secrets: inherit
.github/workflows/benchmark_v2_mi325_caller.yml (new file, 21 lines)

@ -0,0 +1,21 @@
name: Benchmark v2 Scheduled Runner - MI325 Single-GPU

on:
  schedule:
    # Run daily at 16:30 UTC
    - cron: "30 16 * * *"
  pull_request:
    types: [ opened, labeled, reopened, synchronize ]

jobs:
  benchmark-v2-default:
    name: Benchmark v2 - Default Models
    uses: ./.github/workflows/benchmark_v2.yml
    with:
      runner: amd-mi325-ci-1gpu
      container_image: huggingface/transformers-pytorch-amd-gpu
      container_options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache
      commit_sha: ${{ github.sha }}
      run_id: ${{ github.run_id }}
      benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
    secrets: inherit
.github/workflows/build-docker-images.yml (3 changed lines)

@ -5,6 +5,7 @@ on:
    branches:
      - build_ci_docker_image*
  repository_dispatch:
  workflow_dispatch:
  workflow_call:
    inputs:
      image_postfix:

@ -221,7 +222,7 @@ jobs:
  latest-pytorch-amd:
    name: "Latest PyTorch (AMD) [dev]"
    runs-on:
      group: aws-general-8-plus
      group: aws-highcpu-32-priv
    steps:
      -
        name: Set up Docker Buildx
.github/workflows/build_documentation.yml (14 changed lines)

@ -16,8 +16,20 @@ jobs:
      commit_sha: ${{ github.sha }}
      package: transformers
      notebook_folder: transformers_doc
      languages: ar de en es fr hi it ko pt tr zh ja te
      languages: en
      custom_container: huggingface/transformers-doc-builder
    secrets:
      token: ${{ secrets.HUGGINGFACE_PUSH }}
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}

  build_other_lang:
    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
    with:
      commit_sha: ${{ github.sha }}
      package: transformers
      notebook_folder: transformers_doc
      languages: ar de es fr hi it ja ko pt zh
      custom_container: huggingface/transformers-doc-builder
    secrets:
      token: ${{ secrets.HUGGINGFACE_PUSH }}
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
.github/workflows/model_jobs.yml (35 changed lines)

@ -128,28 +128,47 @@ jobs:
          echo "machine_type=$machine_type" >> $GITHUB_ENV
          echo "machine_type=$machine_type" >> $GITHUB_OUTPUT

      - name: Create report directory if it doesn't exist
        shell: bash
        run: |
          mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
          echo "dummy" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/dummy.txt
          ls -la /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
        run: |
          script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
          ls -la
          # Extract the exit code from the output file
          EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)
          exit ${EXIT_CODE:-1}

      - name: Failure short reports
        if: ${{ failure() }}
        # This step is only to show information on Github Actions log.
        # Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
        continue-on-error: true
        run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
        run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt

      - name: Run test
        shell: bash
      - name: Captured information
        if: ${{ failure() }}
        continue-on-error: true
        run: |
          mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
          echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
          echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
          cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt

      - name: Copy test_outputs.txt
        if: ${{ always() }}
        continue-on-error: true
        run: |
          cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
          path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports

  collated_reports:
    name: Collated Reports
@ -14,7 +14,7 @@ permissions: {}
jobs:
  get-pr-number:
    name: Get PR number
    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
    uses: ./.github/workflows/get-pr-number.yml

  get-pr-info:
.github/workflows/self-comment-ci.yml (2 changed lines)

@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
@ -20,7 +20,7 @@ jobs:
    with:
      job: run_models_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_scale_set: amd-mi325-ci
      runner_group: amd-mi325
      docker: huggingface/transformers-pytorch-amd-gpu
      ci_event: Scheduled CI (AMD) - mi325
      report_repo_id: optimum-amd/transformers_daily_ci

@ -33,7 +33,7 @@ jobs:
    with:
      job: run_pipelines_torch_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_scale_set: amd-mi325-ci
      runner_group: amd-mi325
      docker: huggingface/transformers-pytorch-amd-gpu
      ci_event: Scheduled CI (AMD) - mi325
      report_repo_id: optimum-amd/transformers_daily_ci

@ -46,7 +46,7 @@ jobs:
    with:
      job: run_examples_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_scale_set: amd-mi325-ci
      runner_group: amd-mi325
      docker: huggingface/transformers-pytorch-amd-gpu
      ci_event: Scheduled CI (AMD) - mi325
      report_repo_id: optimum-amd/transformers_daily_ci

@ -59,7 +59,7 @@ jobs:
    with:
      job: run_torch_cuda_extensions_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_scale_set: amd-mi325-ci
      runner_group: amd-mi325
      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
      ci_event: Scheduled CI (AMD) - mi325
      report_repo_id: optimum-amd/transformers_daily_ci
@ -3,7 +3,7 @@ name: Self-hosted runner scale set (AMD mi355 scheduled CI caller)
# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
# For example, 1gpu : amd-mi355-ci-1gpu
#              2gpu : amd-mi355-ci-2gpu

on:
  workflow_run:
    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]

@ -20,7 +20,7 @@ jobs:
    with:
      job: run_models_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_scale_set: amd-mi355-ci
      runner_group: hfc-amd-mi355
      docker: huggingface/testing-rocm7.0-preview
      ci_event: Scheduled CI (AMD) - mi355
      report_repo_id: hf-transformers-bot/transformers-ci-dummy

@ -32,7 +32,7 @@ jobs:
    with:
      job: run_pipelines_torch_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_scale_set: amd-mi355-ci
      runner_group: hfc-amd-mi355
      docker: huggingface/testing-rocm7.0-preview
      ci_event: Scheduled CI (AMD) - mi355
      report_repo_id: hf-transformers-bot/transformers-ci-dummy

@ -44,7 +44,7 @@ jobs:
    with:
      job: run_examples_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_scale_set: amd-mi355-ci
      runner_group: hfc-amd-mi355
      docker: huggingface/testing-rocm7.0-preview
      ci_event: Scheduled CI (AMD) - mi355
      report_repo_id: hf-transformers-bot/transformers-ci-dummy

@ -53,10 +53,10 @@ jobs:
  deepspeed-ci:
    name: DeepSpeed CI
    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
  with:
    with:
      job: run_torch_cuda_extensions_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_scale_set: amd-mi355-ci
      runner_group: hfc-amd-mi355
      docker: huggingface/testing-rocm7.0-preview
      ci_event: Scheduled CI (AMD) - mi355
      report_repo_id: hf-transformers-bot/transformers-ci-dummy
.gitignore (1 changed line)

@ -13,6 +13,7 @@ tests/fixtures/cached_*_text.txt
logs/
lightning_logs/
lang_code_data/
reports/

# Distribution / packaging
.Python
@ -278,13 +278,14 @@ are working on it).<br>
useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.<br>
☐ Make sure existing tests pass.<br>
☐ If adding a new feature, also add tests for it.<br>
- If you are adding a new model, make sure you use

- If you are adding a new model, make sure you use
`ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)` to trigger the common tests.
- If you are adding new `@slow` tests, make sure they pass using
- If you are adding new `@slow` tests, make sure they pass using
`RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
- If you are adding a new tokenizer, write tests and make sure
- If you are adding a new tokenizer, write tests and make sure
`RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py` passes.
- CircleCI does not run the slow tests, but GitHub Actions does every night!<br>
- CircleCI does not run the slow tests, but GitHub Actions does every night!<br>

☐ All public methods must have informative docstrings (see
[`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py)

@ -340,6 +341,7 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t
```

Like the slow tests, there are other environment variables available which are not enabled by default during testing:

- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.

More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py).
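
For illustration, here is a minimal sketch of the `all_model_classes` hook mentioned in the contributing checklist above. It uses existing BERT classes purely as stand-ins; real model tests under `tests/models/` additionally inherit the common `ModelTesterMixin`, which is what consumes this attribute.

```python
# Sketch only: shows where all_model_classes lives on a model test class.
import unittest

from transformers import BertForMaskedLM, BertModel


class MyNewModelTest(unittest.TestCase):
    # The tuple of model classes the shared "common" tests should be run against.
    all_model_classes = (BertModel, BertForMaskedLM)


if __name__ == "__main__":
    unittest.main()
```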
@ -38,7 +38,6 @@ In particular all "Please explain" questions or objectively very user-specific f

* "How to train T5 on De->En translation?"


## The GitHub Issues

Everything which hints at a bug should be opened as an [issue](https://github.com/huggingface/transformers/issues).

@ -247,7 +246,6 @@ You are not required to read the following guidelines before opening an issue. H

Try not to use italics and bold text too much as these often make the text more difficult to read.


12. If you are cross-referencing a specific comment in a given thread or another issue, always link to that specific comment, rather than using the issue link. If you do the latter it could be quite impossible to find which specific comment you're referring to.

To get the link to the specific comment do not copy the url from the location bar of your browser, but instead, click the `...` icon in the upper right corner of the comment and then select "Copy Link".

@ -257,7 +255,6 @@ You are not required to read the following guidelines before opening an issue. H
1. https://github.com/huggingface/transformers/issues/9257
2. https://github.com/huggingface/transformers/issues/9257#issuecomment-749945162


13. If you are replying to the last comment, it's totally fine to make your reply with just your comment in it. The readers can follow the information flow here.

But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying to. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like:
README.md (16 changed lines)

@ -48,9 +48,11 @@ limitations under the License.
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_it.md">Italiano</a> |
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_bn.md">বাংলা</a> |
</p>
</h4>

@ -62,12 +64,11 @@ limitations under the License.
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
</h3>

Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
vision, audio, video, and multimodal model, for both inference and training.

Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
vision, audio, video, and multimodal model, for both inference and training.

It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the
pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training
It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the
pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training
frameworks (Axolotl, Unsloth, DeepSpeed, FSDP, PyTorch-Lightning, ...), inference engines (vLLM, SGLang, TGI, ...),
and adjacent modeling libraries (llama.cpp, mlx, ...) which leverage the model definition from `transformers`.

@ -110,10 +111,10 @@ git clone https://github.com/huggingface/transformers.git
cd transformers

# pip
pip install .[torch]
pip install '.[torch]'

# uv
uv pip install .[torch]
uv pip install '.[torch]'
```

## Quickstart

@ -193,7 +194,6 @@ pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.pn
<details>
<summary>Visual question answering</summary>


<h3 align="center">
<a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg"></a>
</h3>

@ -6,7 +6,7 @@ developers, researchers, students, professors, engineers, and anyone else to bui

In this list, we showcase incredibly impactful and novel projects that have pushed the field forward. We celebrate
100 of these projects as we reach the milestone of 100k stars as a community; but we're very open to pull requests
adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR
adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR
to add it.

## [gpt4all](https://github.com/nomic-ai/gpt4all)

@ -49,7 +49,7 @@ Keywords: LLMs, Large Language Models, Agents, Chains

[LlamaIndex](https://github.com/run-llama/llama_index) is a project that provides a central interface to connect your LLM's with external data. It provides various kinds of indices and retrieval mechanisms to perform different LLM tasks and obtain knowledge-augmented results.

Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation
Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation

## [ParlAI](https://github.com/facebookresearch/ParlAI)

@ -257,7 +257,7 @@ Stable-Dreamfusion is a pytorch implementation of the text-to-3D model Dreamfusi
Keywords: Text-to-3D, Stable Diffusion

## [txtai](https://github.com/neuml/txtai)

[txtai](https://github.com/neuml/txtai) is an open-source platform for semantic search and workflows powered by language models. txtai builds embeddings databases, which are a union of vector indexes and relational databases enabling similarity search with SQL. Semantic workflows connect language models together into unified applications.

Keywords: Semantic search, LLM

@ -309,8 +309,8 @@ Keywords: OCR, LaTeX, Math formula

OpenCLIP is an open source implementation of OpenAI's CLIP.

The goal of this repository is to enable training models with contrastive image-text supervision, and to investigate their properties such as robustness to distribution shift.
The starting point is an implementation of CLIP that matches the accuracy of the original CLIP models when trained on the same dataset.
The goal of this repository is to enable training models with contrastive image-text supervision, and to investigate their properties such as robustness to distribution shift.
The starting point is an implementation of CLIP that matches the accuracy of the original CLIP models when trained on the same dataset.

Specifically, a ResNet-50 model trained with this codebase on OpenAI's 15 million image subset of YFCC achieves 32.7% top-1 accuracy on ImageNet.

@ -596,7 +596,7 @@ Keywords: Data-Centric AI, Data Quality, Noisy Labels, Outlier Detection, Active

## [BentoML](https://github.com/bentoml/BentoML)

[BentoML](https://github.com/bentoml) is the unified framework for building, shipping, and scaling production-ready AI applications incorporating traditional ML, pre-trained AI models, Generative and Large Language Models.
[BentoML](https://github.com/bentoml) is the unified framework for building, shipping, and scaling production-ready AI applications incorporating traditional ML, pre-trained AI models, Generative and Large Language Models.
All Hugging Face models and pipelines can be seamlessly integrated into BentoML applications, enabling the running of models on the most suitable hardware and independent scaling based on usage.

Keywords: BentoML, Framework, Deployment, AI Applications

@ -606,4 +606,3 @@ Keywords: BentoML, Framework, Deployment, AI Applications
[LLaMA Factory](https://github.com/hiyouga/LLaMA-Factory) offers a user-friendly fine-tuning framework that incorporates PEFT. The repository includes training(fine-tuning) and inference examples for LLaMA-2, BLOOM, Falcon, Baichuan, Qwen, and other LLMs. A ChatGLM version is also available in [ChatGLM-Efficient-Tuning](https://github.com/hiyouga/ChatGLM-Efficient-Tuning).

Keywords: PEFT, fine-tuning, LLaMA-2, ChatGLM, Qwen
@ -21,6 +21,46 @@ python run_benchmarks.py \
  --num-tokens-to-generate 200
```

### Uploading Results to HuggingFace Dataset

You can automatically upload benchmark results to a HuggingFace Dataset for tracking and analysis:

```bash
# Upload to a public dataset with auto-generated run ID
python run_benchmarks.py --upload-to-hub username/benchmark-results

# Upload with a custom run ID for easy identification
python run_benchmarks.py --upload-to-hub username/benchmark-results --run-id experiment_v1

# Upload with custom HuggingFace token (if not set in environment)
python run_benchmarks.py --upload-to-hub username/benchmark-results --token hf_your_token_here
```

**Dataset Directory Structure:**
```
dataset_name/
├── 2025-01-15/
│   ├── runs/                            # Non-scheduled runs (manual, PR, etc.)
│   │   └── 123-1245151651/              # GitHub run number and ID
│   │       └── benchmark_results/
│   │           ├── benchmark_summary_20250115_143022.json
│   │           └── model-name/
│   │               └── model-name_benchmark_20250115_143022.json
│   └── benchmark_results_abc123de/      # Scheduled runs (daily CI)
│       ├── benchmark_summary_20250115_143022.json
│       └── model-name/
│           └── model-name_benchmark_20250115_143022.json
└── 2025-01-16/
    └── ...
```

**Authentication for Uploads:**

For uploading results, you need a HuggingFace token with write permissions to the target dataset. You can provide the token in several ways (in order of precedence):

1. Command line: `--token hf_your_token_here`
2. Environment variable: `HF_TOKEN`
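
For reference, a minimal sketch of what such an upload looks like with `huggingface_hub`, mirroring the layout above and the `upload_results_to_hf_dataset` helper in `benchmark_v2/run_benchmarks.py` further down in this diff; the dataset name, file names and run ID here are placeholders:

```python
# Placeholder values throughout; the full logic lives in benchmark_v2/run_benchmarks.py below.
import os
from datetime import datetime

from huggingface_hub import HfApi

token = os.environ.get("HF_TOKEN")  # or the value passed via --token
run_id = "123-1245151651"           # GitHub run number and ID, or a custom run ID
date_folder = datetime.now().strftime("%Y-%m-%d")

# Non-scheduled runs go under a runs/ subfolder; scheduled runs sit directly under the date.
repo_path = f"{date_folder}/runs/{run_id}/benchmark_results"

HfApi().upload_file(
    path_or_fileobj="benchmark_results/benchmark_summary_20250115_143022.json",
    path_in_repo=f"{repo_path}/benchmark_summary_20250115_143022.json",
    repo_id="username/benchmark-results",
    repo_type="dataset",
    token=token,
    commit_message=f"Upload benchmark results for run {run_id}",
)
```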

### Running Specific Benchmarks

```bash
@ -20,7 +20,6 @@ import torch
from benchmark_framework import ModelBenchmark


os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high")
@ -3,4 +3,5 @@ psutil>=5.8.0
gpustat>=1.0.0
torch>=2.0.0
transformers>=4.30.0
datasets>=2.10.0
datasets>=2.10.0
huggingface_hub>=0.16.0
@ -24,6 +24,7 @@ import json
import logging
import os
import sys
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Optional

@ -160,7 +161,12 @@ def run_single_benchmark(
    return None


def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
def generate_summary_report(
    output_dir: str,
    benchmark_results: dict[str, Any],
    logger: logging.Logger,
    benchmark_run_uuid: Optional[str] = None,
) -> str:
    """Generate a summary report of all benchmark runs."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")

@ -168,6 +174,7 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any],
    summary_data = {
        "run_metadata": {
            "timestamp": datetime.utcnow().isoformat(),
            "benchmark_run_uuid": benchmark_run_uuid,
            "total_benchmarks": len(benchmark_results),
            "successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
            "failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),

@ -183,9 +190,114 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any],
    return summary_file


def upload_results_to_hf_dataset(
    output_dir: str,
    summary_file: str,
    dataset_name: str,
    run_id: Optional[str] = None,
    token: Optional[str] = None,
    logger: Optional[logging.Logger] = None,
) -> Optional[str]:
    """
    Upload benchmark results to a HuggingFace Dataset.
    Based on upload_collated_report() from utils/collated_reports.py
    Args:
        output_dir: Local output directory containing results
        summary_file: Path to the summary file
        dataset_name: Name of the HuggingFace dataset to upload to
        run_id: Unique run identifier (if None, will generate one)
        token: HuggingFace token for authentication (if None, will use environment variables)
        logger: Logger instance
    Returns:
        The run_id used for the upload, None if upload failed
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    import os

    from huggingface_hub import HfApi

    api = HfApi()

    if run_id is None:
        github_run_number = os.getenv("GITHUB_RUN_NUMBER")
        github_run_id = os.getenv("GITHUB_RUN_ID")
        if github_run_number and github_run_id:
            run_id = f"{github_run_number}-{github_run_id}"

    date_folder = datetime.now().strftime("%Y-%m-%d")

    github_event_name = os.getenv("GITHUB_EVENT_NAME")
    if github_event_name != "schedule":
        # Non-scheduled runs go under a runs subfolder
        repo_path = f"{date_folder}/runs/{run_id}/benchmark_results"
    else:
        # Scheduled runs go directly under the date
        repo_path = f"{date_folder}/{run_id}/benchmark_results"

    logger.info(f"Uploading benchmark results to dataset '{dataset_name}' at path '{repo_path}'")

    try:
        # Upload all files in the output directory
        from pathlib import Path

        output_path = Path(output_dir)

        for file_path in output_path.rglob("*"):
            if file_path.is_file():
                # Calculate relative path from output_dir
                relative_path = file_path.relative_to(output_path)
                path_in_repo = f"{repo_path}/{relative_path}"

                logger.debug(f"Uploading {file_path} to {path_in_repo}")

                api.upload_file(
                    path_or_fileobj=str(file_path),
                    path_in_repo=path_in_repo,
                    repo_id=dataset_name,
                    repo_type="dataset",
                    token=token,
                    commit_message=f"Upload benchmark results for run {run_id}",
                )

        logger.info(
            f"Successfully uploaded results to: https://huggingface.co/datasets/{dataset_name}/tree/main/{repo_path}"
        )

        return run_id

    except Exception as upload_error:
        logger.error(f"Failed to upload results: {upload_error}")
        import traceback

        logger.debug(traceback.format_exc())
        return None


def main():
    """Main entry point for the benchmarking script."""
    parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
    # Generate a unique UUID for this benchmark run
    benchmark_run_uuid = str(uuid.uuid4())[:8]

    parser = argparse.ArgumentParser(
        description="Run all benchmarks in the ./benches directory",
        epilog="""
Examples:
  # Run all available benchmarks
  python3 run_benchmarks.py

  # Run with specific model and upload to HuggingFace Dataset
  python3 run_benchmarks.py --model-id meta-llama/Llama-2-7b-hf --upload-to-hf username/benchmark-results

  # Run with custom run ID and upload to HuggingFace Dataset
  python3 run_benchmarks.py --run-id experiment_v1 --upload-to-hf org/benchmarks

  # Run only specific benchmarks with file logging
  python3 run_benchmarks.py --include llama --enable-file-logging
        """,  # noqa: W293
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        "--output-dir",
@ -228,20 +340,35 @@ def main():

    parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")

    parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")

    parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")

    parser.add_argument(
        "--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
    )

    parser.add_argument(
        "--push-to-hub",
        type=str,
        help="Upload results to HuggingFace Dataset (provide dataset name, e.g., 'username/benchmark-results')",
    )

    parser.add_argument(
        "--run-id", type=str, help="Custom run ID for organizing results (if not provided, will generate a unique ID)"
    )

    parser.add_argument(
        "--token",
        type=str,
        help="HuggingFace token for dataset uploads (if not provided, will use HF_TOKEN environment variable)",
    )

    args = parser.parse_args()

    # Setup logging
    logger = setup_logging(args.log_level, args.enable_file_logging)

    logger.info("Starting benchmark discovery and execution")
    logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
    logger.info(f"Output directory: {args.output_dir}")
    logger.info(f"Benches directory: {args.benches_dir}")

@ -286,9 +413,6 @@ def main():
    if args.model_id:
        benchmark_kwargs["model_id"] = args.model_id

    # Add enable_mock flag for mock benchmark
    benchmark_kwargs["enable_mock"] = args.enable_mock

    # Add commit_id if provided
    if args.commit_id:
        benchmark_kwargs["commit_id"] = args.commit_id

@ -306,7 +430,28 @@ def main():
            successful_count += 1

    # Generate summary report
    summary_file = generate_summary_report(args.output_dir, benchmark_results, logger)
    summary_file = generate_summary_report(args.output_dir, benchmark_results, logger, benchmark_run_uuid)

    # Upload results to HuggingFace Dataset if requested
    upload_run_id = None
    if args.push_to_hub:
        logger.info("=" * 60)
        logger.info("UPLOADING TO HUGGINGFACE DATASET")
        logger.info("=" * 60)
        # Use provided run_id or fallback to benchmark run UUID
        effective_run_id = args.run_id or benchmark_run_uuid
        upload_run_id = upload_results_to_hf_dataset(
            output_dir=args.output_dir,
            summary_file=summary_file,
            dataset_name=args.push_to_hub,
            run_id=effective_run_id,
            token=args.token,
            logger=logger,
        )
        if upload_run_id:
            logger.info(f"Upload completed with run ID: {upload_run_id}")
        else:
            logger.warning("Upload failed - continuing with local results")

    # Final summary
    total_benchmarks = len(filtered_benchmarks)

@ -321,6 +466,16 @@ def main():
    logger.info(f"Output directory: {args.output_dir}")
    logger.info(f"Summary report: {summary_file}")

    if args.push_to_hub:
        if upload_run_id:
            logger.info(f"HuggingFace Dataset: {args.push_to_hub}")
            logger.info(f"Run ID: {upload_run_id}")
            logger.info(
                f"View results: https://huggingface.co/datasets/{args.push_to_hub}/tree/main/{datetime.now().strftime('%Y-%m-%d')}/runs/{upload_run_id}"
            )
        else:
            logger.warning("Upload to HuggingFace Dataset failed")

    if failed_count > 0:
        logger.warning(f"{failed_count} benchmark(s) failed. Check logs for details.")
        return 1
@ -54,7 +54,6 @@ NOT_DEVICE_TESTS = {
|
||||
"test_gradient_checkpointing_backward_compatibility",
|
||||
"test_gradient_checkpointing_enable_disable",
|
||||
"test_torch_save_load",
|
||||
"test_initialization",
|
||||
"test_forward_signature",
|
||||
"test_model_get_set_embeddings",
|
||||
"test_model_main_input_name",
|
||||
@ -64,8 +63,7 @@ NOT_DEVICE_TESTS = {
|
||||
"test_load_save_without_tied_weights",
|
||||
"test_tied_weights_keys",
|
||||
"test_model_weights_reload_no_missing_tied_weights",
|
||||
"test_mismatched_shapes_have_properly_initialized_weights",
|
||||
"test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist",
|
||||
"test_can_load_ignoring_mismatched_shapes",
|
||||
"test_model_is_small",
|
||||
"test_tf_from_pt_safetensors",
|
||||
"test_flax_from_pt_safetensors",
|
||||
@ -93,6 +91,8 @@ def pytest_configure(config):
|
||||
config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
|
||||
config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
|
||||
|
||||
os.environ['DISABLE_SAFETENSORS_CONVERSION'] = 'true'
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(items):
|
||||
for item in items:
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
USER root
|
||||
ARG REF=main
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -38,3 +38,10 @@ RUN python3 -m pip uninstall -y kernels
|
||||
|
||||
# On ROCm, torchcodec is required to decode audio files and 0.4 or 0.6 fails
|
||||
RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
|
||||
|
||||
# Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
|
||||
RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
|
||||
cd flash-attention && \
|
||||
GPU_ARCHS="gfx942" python setup.py install
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir einops
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
|
||||
FROM nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
@ -9,9 +9,9 @@ SHELL ["sh", "-lc"]
|
||||
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
|
||||
# to be used as arguments for docker build (so far).
|
||||
|
||||
ARG PYTORCH='2.6.0'
|
||||
ARG PYTORCH='2.8.0'
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
ARG CUDA='cu121'
|
||||
ARG CUDA='cu126'
|
||||
# Disable kernel mapping for quantization tests
|
||||
ENV DISABLE_KERNEL_MAPPING=1
|
||||
|
||||
@ -30,31 +30,20 @@ RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio tor
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
# needed in bnb and awq
|
||||
RUN python3 -m pip install --no-cache-dir einops
|
||||
|
||||
# Add bitsandbytes for mixed int8 testing
|
||||
RUN python3 -m pip install --no-cache-dir bitsandbytes
|
||||
|
||||
# Add gptqmodel for gtpq quantization testing, installed from source for pytorch==2.6.0 compatibility
|
||||
RUN python3 -m pip install lm_eval
|
||||
RUN git clone https://github.com/ModelCloud/GPTQModel.git && cd GPTQModel && pip install -v . --no-build-isolation
|
||||
|
||||
# Add optimum for gptq quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
|
||||
|
||||
# Add PEFT
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft
|
||||
|
||||
# Add aqlm for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2
|
||||
# needed in bnb and awq
|
||||
RUN python3 -m pip install --no-cache-dir einops
|
||||
|
||||
# Add vptq for quantization testing
|
||||
RUN pip install vptq
|
||||
# Add bitsandbytes
|
||||
RUN python3 -m pip install --no-cache-dir bitsandbytes
|
||||
|
||||
# Add spqr for quantization testing
|
||||
# Commented for now as No matching distribution found we need to reach out to the authors
|
||||
# RUN python3 -m pip install --no-cache-dir spqr_quant[gpu]
|
||||
# # Add gptqmodel
|
||||
# RUN python3 -m pip install --no-cache-dir gptqmodel
|
||||
|
||||
# Add hqq for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir hqq
|
||||
@ -63,25 +52,11 @@ RUN python3 -m pip install --no-cache-dir hqq
|
||||
RUN python3 -m pip install --no-cache-dir gguf
|
||||
|
||||
# Add autoawq for quantization testing
|
||||
# New release v0.2.8
|
||||
RUN python3 -m pip install --no-cache-dir autoawq[kernels]
|
||||
|
||||
# Add quanto for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir optimum-quanto
|
||||
|
||||
# Add eetq for quantization testing
|
||||
RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submodule update --init --recursive && pip install .
|
||||
|
||||
# # Add flute-kernel and fast_hadamard_transform for quantization testing
|
||||
# # Commented for now as they cause issues with the build
|
||||
# # TODO: create a new workflow to test them
|
||||
# RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
|
||||
# RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git
|
||||
|
||||
# Add fp-quant for quantization testing
|
||||
# Requires py3.11 but our CI runs on 3.9
|
||||
# RUN python3 -m pip install --no-cache-dir "fp-quant>=0.1.6"
|
||||
|
||||
# Add compressed-tensors for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir compressed-tensors
|
||||
|
||||
@ -89,7 +64,10 @@ RUN python3 -m pip install --no-cache-dir compressed-tensors
|
||||
RUN python3 -m pip install --no-cache-dir amd-quark
|
||||
|
||||
# Add AutoRound for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
|
||||
RUN python3 -m pip install --no-cache-dir auto-round
|
||||
|
||||
# Add torchao for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir torchao
|
||||
|
||||
# Add transformers in editable mode
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
|
||||
@ -103,3 +81,27 @@ RUN python3 -m pip uninstall -y flash-attn
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
# this line must be added in order for python to be aware of transformers.
|
||||
RUN cd transformers && python3 setup.py develop
|
||||
|
||||
# Add fp-quant for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir "fp-quant>=0.2.0"
|
||||
|
||||
# Low usage or incompatible lib, will enable later on
|
||||
|
||||
# # Add aqlm for quantization testing
|
||||
# RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2
|
||||
|
||||
# # Add vptq for quantization testing
|
||||
# RUN pip install vptq
|
||||
|
||||
# Add spqr for quantization testing
|
||||
# Commented for now as No matching distribution found we need to reach out to the authors
|
||||
# RUN python3 -m pip install --no-cache-dir spqr_quant[gpu]
|
||||
|
||||
# # Add eetq for quantization testing
|
||||
# RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submodule update --init --recursive && pip install .
|
||||
|
||||
# # Add flute-kernel and fast_hadamard_transform for quantization testing
|
||||
# # Commented for now as they cause issues with the build
|
||||
# # TODO: create a new workflow to test them
|
||||
# RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
|
||||
# RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git
|
||||
|
@ -50,7 +50,7 @@ Begin translating the text!
|
||||
|
||||
1. Start with the `_toctree.yml` file that corresponds to your documentation chapter. This file is essential for rendering the table of contents on the website.
|
||||
|
||||
- If the `_toctree.yml` file doesn’t exist for your language, create one by copying the English version and removing unrelated sections.
|
||||
- If the `_toctree.yml` file doesn't exist for your language, create one by copying the English version and removing unrelated sections.
|
||||
- Ensure it is placed in the `docs/source/LANG-ID/` directory.
|
||||
|
||||
Here’s an example structure for the `_toctree.yml` file:
|
||||
|
@ -307,6 +307,8 @@
|
||||
title: Glossary
|
||||
- local: philosophy
|
||||
title: Philosophy
|
||||
- local: models_timeline
|
||||
title: Models Timeline
|
||||
- local: notebooks
|
||||
title: Notebooks with examples
|
||||
- local: community
|
||||
@ -411,6 +413,8 @@
|
||||
title: Blenderbot Small
|
||||
- local: model_doc/bloom
|
||||
title: BLOOM
|
||||
- local: model_doc/blt
|
||||
title: BLT
|
||||
- local: model_doc/bort
|
||||
title: BORT
|
||||
- local: model_doc/byt5
|
||||
@ -441,6 +445,8 @@
|
||||
title: DeBERTa
|
||||
- local: model_doc/deberta-v2
|
||||
title: DeBERTa-v2
|
||||
- local: model_doc/deepseek_v2
|
||||
title: DeepSeek-V2
|
||||
- local: model_doc/deepseek_v3
|
||||
title: DeepSeek-V3
|
||||
- local: model_doc/dialogpt
|
||||
@ -763,12 +769,6 @@
|
||||
title: D-FINE
|
||||
- local: model_doc/dab-detr
|
||||
title: DAB-DETR
|
||||
- local: model_doc/deepseek_v2
|
||||
title: DeepSeek-V2
|
||||
- local: model_doc/deepseek_vl
|
||||
title: DeepseekVL
|
||||
- local: model_doc/deepseek_vl_hybrid
|
||||
title: DeepseekVLHybrid
|
||||
- local: model_doc/deformable_detr
|
||||
title: Deformable DETR
|
||||
- local: model_doc/deit
|
||||
@ -851,10 +851,16 @@
|
||||
title: RT-DETR
|
||||
- local: model_doc/rt_detr_v2
|
||||
title: RT-DETRv2
|
||||
- local: model_doc/sam2
|
||||
title: SAM2
|
||||
- local: model_doc/segformer
|
||||
title: SegFormer
|
||||
- local: model_doc/seggpt
|
||||
title: SegGpt
|
||||
- local: model_doc/sam
|
||||
title: Segment Anything
|
||||
- local: model_doc/sam_hq
|
||||
title: Segment Anything High Quality
|
||||
- local: model_doc/superglue
|
||||
title: SuperGlue
|
||||
- local: model_doc/superpoint
|
||||
@ -933,6 +939,8 @@
|
||||
title: MusicGen
|
||||
- local: model_doc/musicgen_melody
|
||||
title: MusicGen Melody
|
||||
- local: model_doc/parakeet
|
||||
title: Parakeet
|
||||
- local: model_doc/pop2piano
|
||||
title: Pop2Piano
|
||||
- local: model_doc/seamless_m4t
|
||||
@ -977,6 +985,8 @@
|
||||
title: XLSR-Wav2Vec2
|
||||
title: Audio models
|
||||
- sections:
|
||||
- local: model_doc/sam2_video
|
||||
title: SAM2 Video
|
||||
- local: model_doc/timesformer
|
||||
title: TimeSformer
|
||||
- local: model_doc/vjepa2
|
||||
@ -1021,10 +1031,18 @@
|
||||
title: ColQwen2
|
||||
- local: model_doc/data2vec
|
||||
title: Data2Vec
|
||||
- local: model_doc/deepseek_vl
|
||||
title: DeepseekVL
|
||||
- local: model_doc/deepseek_vl_hybrid
|
||||
title: DeepseekVLHybrid
|
||||
- local: model_doc/deplot
|
||||
title: DePlot
|
||||
- local: model_doc/donut
|
||||
title: Donut
|
||||
- local: model_doc/edgetam
|
||||
title: EdgeTAM
|
||||
- local: model_doc/edgetam_video
|
||||
title: EdgeTamVideo
|
||||
- local: model_doc/emu3
|
||||
title: Emu3
|
||||
- local: model_doc/evolla
|
||||
@ -1077,6 +1095,8 @@
|
||||
title: LayoutLMV3
|
||||
- local: model_doc/layoutxlm
|
||||
title: LayoutXLM
|
||||
- local: model_doc/lfm2_vl
|
||||
title: LFM2-VL
|
||||
- local: model_doc/lilt
|
||||
title: LiLT
|
||||
- local: model_doc/llama4
|
||||
@ -1135,18 +1155,12 @@
|
||||
title: Qwen2Audio
|
||||
- local: model_doc/qwen2_vl
|
||||
title: Qwen2VL
|
||||
- local: model_doc/qwen3_omni_moe
|
||||
title: Qwen3-Omni-MoE
|
||||
- local: model_doc/qwen3_vl
|
||||
title: Qwen3VL
|
||||
- local: model_doc/qwen3_vl_moe
|
||||
title: Qwen3VLMoe
|
||||
- local: model_doc/sam2
|
||||
title: SAM2
|
||||
- local: model_doc/sam2_video
|
||||
title: SAM2 Video
|
||||
- local: model_doc/sam
|
||||
title: Segment Anything
|
||||
- local: model_doc/sam_hq
|
||||
title: Segment Anything High Quality
|
||||
- local: model_doc/shieldgemma2
|
||||
title: ShieldGemma2
|
||||
- local: model_doc/siglip
|
||||
|
@ -69,7 +69,6 @@ CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
||||
Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
|
||||
To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):
|
||||
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
|
||||
```
|
||||
@ -108,7 +107,6 @@ To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
|
||||
ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
|
||||
```
|
||||
|
||||
|
||||
You can also control the order of Intel XPUs with:
|
||||
|
||||
```bash
|
||||
@ -120,7 +118,5 @@ For more information about device enumeration and sorting on Intel XPU, please r
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
|
||||
> [!WARNING]
|
||||
> Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
|
||||
|
@ -193,4 +193,4 @@ def custom_attention_mask(
|
||||
|
||||
It mostly works thanks to the `mask_function`, which is a `Callable` in the form of [torch's mask_mod functions](https://pytorch.org/blog/flexattention/), taking 4 indices as input and returning a boolean to indicate if this position should take part in the attention computation.
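For illustration, here is a minimal sketch of such a callable (the function name and the `window` value are made up for the example); it follows the 4-index signature described above and returns whether a query position may attend to a key/value position:

```py
# Minimal sketch of a mask_mod-style callable: given batch, head, query and
# key/value indices, return True if this (q_idx, kv_idx) pair should take part
# in the attention computation. `&` is used so the same logic also works when
# the indices are tensors.
def sliding_window_causal(batch_idx, head_idx, q_idx, kv_idx, window=128):
    causal = kv_idx <= q_idx               # never attend to future positions
    in_window = (q_idx - kv_idx) < window  # only look back `window` tokens
    return causal & in_window
```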
|
||||
|
||||
If you cannot use the `mask_function` to create your mask for some reason, you can try to work around it by doing something similar to our [torch export workaround](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/executorch.py).
|
||||
If you cannot use the `mask_function` to create your mask for some reason, you can try to work around it by doing something similar to our [torch export workaround](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/executorch.py).
|
||||
|
@ -145,7 +145,6 @@ Arguments can also be passed directly to `@auto_docstring` for more control. Use
|
||||
|
||||
The `Returns` and `Examples` parts of the docstring can also be manually specified.
|
||||
|
||||
|
||||
```python
|
||||
MODEL_COMMON_CUSTOM_ARGS = r"""
|
||||
common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
|
||||
@ -202,7 +201,6 @@ There are some rules for documenting different types of arguments and they're li
|
||||
|
||||
If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding.
|
||||
|
||||
|
||||
- New or custom arguments should be documented within an `r""" """` block after the signature if it is a function or in the `__init__` method's docstring if it is a class.
|
||||
|
||||
```py
|
||||
@ -212,9 +210,9 @@ There are some rules for documenting different types of arguments and they're li
|
||||
This can span multiple lines.
|
||||
```
|
||||
|
||||
* Include `type` in backticks.
|
||||
* Add *optional* if the argument is not required or has a default value.
|
||||
* Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`.
|
||||
* Include `type` in backticks.
|
||||
* Add *optional* if the argument is not required or has a default value.
|
||||
* Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`.
|
||||
|
||||
These arguments can also be passed to `@auto_docstring` as a `custom_args` argument. It is used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
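As a rough sketch of what that looks like in a modeling file (the import path and the `forward` signature below are assumptions for the example, reusing the `MODEL_COMMON_CUSTOM_ARGS` block defined above):

```py
from transformers.utils import auto_docstring  # assumed import path

@auto_docstring(custom_args=MODEL_COMMON_CUSTOM_ARGS)
def forward(self, input_ids, common_arg_1=None, common_arg_2=None):
    ...
```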
|
||||
|
||||
|
@ -59,11 +59,9 @@ Refer to the table below to compare how caching improves efficiency.
|
||||
|
||||
| without caching | with caching |
|
||||
|---|---|
|
||||
| for each step, recompute all previous `K` and `V` | for each step, only compute current `K` and `V`
|
||||
| for each step, recompute all previous `K` and `V` | for each step, only compute current `K` and `V`
|
||||
| attention cost per step is **quadratic** with sequence length | attention cost per step is **linear** with sequence length (memory grows linearly, but compute/token remains low) |
|
||||
|
||||
|
||||
|
||||
## Cache class
|
||||
|
||||
A basic KV cache interface takes a key and value tensor for the current token and returns the updated `K` and `V` tensors. This is internally managed by a model's `forward` method.
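As a small illustration of that interface, here is a sketch that drives a [`DynamicCache`] by hand (the shapes are arbitrary and follow the `[batch_size, num_heads, seq_len, head_dim]` convention):

```py
import torch
from transformers import DynamicCache

cache = DynamicCache()

# Pretend these are the key/value projections for the current token of layer 0
key_states = torch.rand(1, 8, 1, 64)    # [batch_size, num_heads, seq_len, head_dim]
value_states = torch.rand(1, 8, 1, 64)

# `update` stores the new states and returns the full K and V tensors seen so far
keys, values = cache.update(key_states, value_states, layer_idx=0)
print(keys.shape)  # the seq_len dimension grows on later updates
```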
|
||||
@ -138,12 +136,11 @@ The cache position tracks where to insert new tokens in the attention cache. It
|
||||
|
||||
Cache position is used internally for two purposes:
|
||||
|
||||
1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven’t been cached yet are passed to the model's `forward`.
|
||||
1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven't been cached yet are passed to the model's `forward`.
|
||||
2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches that pre-allocate a specific cache length.
|
||||
|
||||
The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots.
|
||||
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache, infer_device
|
||||
@ -160,12 +157,12 @@ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=10)
|
||||
|
||||
```
|
||||
|
||||
|
||||
## Legacy cache format
|
||||
|
||||
Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].
|
||||
|
||||
The legacy format is essentially the same data structure but organized differently.
|
||||
|
||||
- It's a tuple of tuples, where each inner tuple contains the key and value tensors for a layer.
|
||||
- The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`.
|
||||
- The format is less flexible and doesn't support features like quantization or offloading.
|
||||
|
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Tool use
|
||||
|
||||
Chat models are commonly trained with support for "function-calling" or "tool-use". Tools are functions supplied by the user, which the model can choose to call as part of its response. For example, models could have access to a calculator tool to perform arithmetic without having to it internally.
|
||||
Chat models are commonly trained with support for "function-calling" or "tool-use". Tools are functions supplied by the user, which the model can choose to call as part of its response. For example, models could have access to a calculator tool to perform arithmetic without having to perform the computation internally.
|
||||
|
||||
This guide will demonstrate how to define tools, how to pass them to a chat model, and how to handle the model's output when it calls a tool.
|
||||
|
||||
@ -29,12 +29,11 @@ the arguments, argument types, and function docstring are parsed in order to gen
|
||||
Although passing Python functions is very convenient, the parser can only handle [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings)
|
||||
docstrings. Refer to the examples below for how to format a tool-ready function.
|
||||
|
||||
|
||||
```py
|
||||
def get_current_temperature(location: str, unit: str):
|
||||
"""
|
||||
Get the current temperature at a location.
|
||||
|
||||
|
||||
Args:
|
||||
location: The location to get the temperature for, in the format "City, Country"
|
||||
unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"])
|
||||
@ -44,7 +43,7 @@ def get_current_temperature(location: str, unit: str):
|
||||
def get_current_wind_speed(location: str):
|
||||
"""
|
||||
Get the current wind speed in km/h at a given location.
|
||||
|
||||
|
||||
Args:
|
||||
location: The location to get the wind speed for, in the format "City, Country"
|
||||
"""
|
||||
@ -103,7 +102,6 @@ Hold the call in the `tool_calls` key of an `assistant` message. This is the rec
|
||||
> [!WARNING]
|
||||
> Although `tool_calls` is similar to the OpenAI API, the OpenAI API uses a JSON string as its `tool_calls` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.
|
||||
|
||||
|
||||
```py
|
||||
tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
|
||||
messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
|
||||
@ -131,7 +129,6 @@ The temperature in Paris, France right now is 22°C.<|im_end|>
|
||||
> Although the key in the assistant message is called `tool_calls`, in most cases, models only emit a single tool call at a time. Some older models emit multiple tool calls at the same time, but this is a
|
||||
> significantly more complex process, as you need to handle multiple tool responses at once and disambiguate them, often using tool call IDs. Please refer to the model card to see exactly what format a model expects for tool calls.
|
||||
|
||||
|
||||
## JSON schemas
|
||||
|
||||
Another way to define tools is by passing a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
|
||||
@ -147,7 +144,7 @@ from transformers.utils import get_json_schema
|
||||
def multiply(a: float, b: float):
|
||||
"""
|
||||
A function that multiplies two numbers
|
||||
|
||||
|
||||
Args:
|
||||
a: The first number to multiply
|
||||
b: The second number to multiply
|
||||
@ -160,22 +157,22 @@ print(schema)
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "function",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "multiply",
|
||||
"description": "A function that multiplies two numbers",
|
||||
"name": "multiply",
|
||||
"description": "A function that multiplies two numbers",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"a": {
|
||||
"type": "number",
|
||||
"type": "number",
|
||||
"description": "The first number to multiply"
|
||||
},
|
||||
},
|
||||
"b": {
|
||||
"type": "number",
|
||||
"description": "The second number to multiply"
|
||||
}
|
||||
},
|
||||
},
|
||||
"required": ["a", "b"]
|
||||
}
|
||||
}
|
||||
@ -187,7 +184,7 @@ We won't go into the details of JSON schema itself here, since it's already [ver
|
||||
```py
|
||||
# A simple function that takes no arguments
|
||||
current_time = {
|
||||
"type": "function",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "current_time",
|
||||
"description": "Get the current local time as a string.",
|
||||
@ -203,18 +200,18 @@ multiply = {
|
||||
'type': 'function',
|
||||
'function': {
|
||||
'name': 'multiply',
|
||||
'description': 'A function that multiplies two numbers',
|
||||
'description': 'A function that multiplies two numbers',
|
||||
'parameters': {
|
||||
'type': 'object',
|
||||
'type': 'object',
|
||||
'properties': {
|
||||
'a': {
|
||||
'type': 'number',
|
||||
'description': 'The first number to multiply'
|
||||
},
|
||||
},
|
||||
'b': {
|
||||
'type': 'number', 'description': 'The second number to multiply'
|
||||
}
|
||||
},
|
||||
},
|
||||
'required': ['a', 'b']
|
||||
}
|
||||
}
|
||||
@ -224,4 +221,4 @@ model_input = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
tools = [current_time, multiply]
|
||||
)
|
||||
```
|
||||
```
|
||||
|
@ -16,13 +16,13 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Chat templates
|
||||
|
||||
The [chat basics](./conversations) guide covers how to store chat histories and generate text from chat models using [`TextGenerationPipeline`].
|
||||
The [chat basics](./conversations) guide covers how to store chat histories and generate text from chat models using [`TextGenerationPipeline`].
|
||||
|
||||
This guide is intended for more advanced users, and covers the underlying classes and methods, as well as the key concepts for understanding what's actually going on when you chat with a model.
|
||||
|
||||
The critical insight needed to understand chat models is this: All causal LMs, whether chat-trained or not, continue a sequence of tokens. When causal LMs are trained, the training usually begins with "pre-training" on a huge corpus of text, which creates a "base" model.
|
||||
These base models are then often "fine-tuned" for chat, which means training them on data that is formatted as a sequence of messages. The chat is still just a sequence of tokens, though! The list of `role` and `content` dictionaries that you pass
|
||||
to a chat model get converted to a token sequence, often with control tokens like `<|user|>` or `<|assistant|>` or `<|end_of_message|>`, which allow the model to see the chat structure.
|
||||
to a chat model get converted to a token sequence, often with control tokens like `<|user|>` or `<|assistant|>` or `<|end_of_message|>`, which allow the model to see the chat structure.
|
||||
There are many possible chat formats, and different models may use different formats or control tokens, even if they were fine-tuned from the same base model!
|
||||
|
||||
Don't panic, though - you don't need to memorize every possible chat format in order to use chat models. Chat models come with **chat templates**, which indicate how they expect chats to be formatted.
|
||||
@ -43,6 +43,7 @@ chat = [
|
||||
|
||||
tokenizer.apply_chat_template(chat, tokenize=False)
|
||||
```
|
||||
|
||||
```md
|
||||
<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]
|
||||
```
|
||||
@ -62,6 +63,7 @@ chat = [
|
||||
|
||||
tokenizer.apply_chat_template(chat, tokenize=False)
|
||||
```
|
||||
|
||||
```md
|
||||
<|user|>\nHello, how are you?</s>\n<|assistant|>\nI'm doing great. How can I help you today?</s>\n<|user|>\nI'd like to show off how chat templating works!</s>\n
|
||||
```
|
||||
@ -75,9 +77,9 @@ Mistral-7B-Instruct uses `[INST]` and `[/INST]` tokens to indicate the start and
|
||||
|
||||
The input to `apply_chat_template` should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker, and the `content` key contains the message. The common roles are:
|
||||
|
||||
- `user` for messages from the user
|
||||
- `assistant` for messages from the model
|
||||
- `system` for directives on how the model should act (usually placed at the beginning of the chat)
|
||||
- `user` for messages from the user
|
||||
- `assistant` for messages from the model
|
||||
- `system` for directives on how the model should act (usually placed at the beginning of the chat)
|
||||
|
||||
[`apply_chat_template`] takes this list and returns a formatted sequence. Set `tokenize=True` if you want to tokenize the sequence.
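For example, a sketch of the tokenized path (the checkpoint name is only an example; `return_dict=True` and `return_tensors="pt"` give tensors that can be passed straight to `generate`):

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # example checkpoint
chat = [
    {"role": "system", "content": "You are a friendly chatbot."},
    {"role": "user", "content": "How do chat templates work?"},
]

# tokenize=True returns token ids instead of a string; with return_dict=True and
# return_tensors="pt" the result can be passed directly to model.generate
inputs = tokenizer.apply_chat_template(
    chat, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
)
print(inputs["input_ids"].shape)
```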
|
||||
|
||||
@ -110,6 +112,7 @@ Pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response.
|
||||
outputs = model.generate(tokenized_chat, max_new_tokens=128)
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
```
|
||||
|
||||
```md
|
||||
<|system|>
|
||||
You are a friendly chatbot who always responds in the style of a pirate</s>
|
||||
@ -121,13 +124,13 @@ Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopte
|
||||
|
||||
> [!WARNING]
|
||||
> Some tokenizers add special `<bos>` and `<eos>` tokens. Chat templates should already include all the necessary special tokens, and adding additional special tokens is often incorrect or duplicated, hurting model performance. When you format text with `apply_chat_template(tokenize=False)`, make sure you set `add_special_tokens=False` if you tokenize later to avoid duplicating these tokens.
|
||||
> This isn’t an issue if you use `apply_chat_template(tokenize=True)`, which means it's usually the safer option!
|
||||
> This isn't an issue if you use `apply_chat_template(tokenize=True)`, which means it's usually the safer option!
|
||||
|
||||
### add_generation_prompt
|
||||
|
||||
You may have noticed the [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) argument in the above examples.
|
||||
You may have noticed the [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) argument in the above examples.
|
||||
This argument adds tokens to the end of the chat that indicate the start of an `assistant` response. Remember: Beneath all the chat abstractions, chat models are still just language models that continue a sequence of tokens!
|
||||
If you include tokens that tell it that it's now in an `assistant` response, it will correctly write a response, but if you don't include these tokens, the model may get confused and do something strange, like **continuing** the user's message instead of replying to it!
|
||||
If you include tokens that tell it that it's now in an `assistant` response, it will correctly write a response, but if you don't include these tokens, the model may get confused and do something strange, like **continuing** the user's message instead of replying to it!
|
||||
|
||||
Let's see an example to understand what `add_generation_prompt` is actually doing. First, let's format a chat without `add_generation_prompt`:
|
||||
|
||||
@ -135,6 +138,7 @@ Let's see an example to understand what `add_generation_prompt` is actually doin
|
||||
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
|
||||
tokenized_chat
|
||||
```
|
||||
|
||||
```md
|
||||
<|im_start|>user
|
||||
Hi there!<|im_end|>
|
||||
@ -150,6 +154,7 @@ Now, let's format the same chat with `add_generation_prompt=True`:
|
||||
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
tokenized_chat
|
||||
```
|
||||
|
||||
```md
|
||||
<|im_start|>user
|
||||
Hi there!<|im_end|>
|
||||
@ -163,7 +168,7 @@ Can I ask a question?<|im_end|>
|
||||
|
||||
When `add_generation_prompt=True`, `<|im_start|>assistant` is added at the end to indicate the start of an `assistant` message. This lets the model know an `assistant` response is next.
|
||||
|
||||
Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don’t have any special tokens before the `assistant` response. In these cases, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect.
|
||||
Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don't have any special tokens before the `assistant` response. In these cases, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect.
|
||||
|
||||
### continue_final_message
|
||||
|
||||
@ -182,14 +187,13 @@ model.generate(**formatted_chat)
|
||||
```
|
||||
|
||||
> [!WARNING]
|
||||
> You shouldn’t use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error.
|
||||
|
||||
[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the `assistant` role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don’t support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) argument to the pipeline.
|
||||
> You shouldn't use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error.
|
||||
|
||||
[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the `assistant` role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don't support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) argument to the pipeline.
|
||||
|
||||
## Model training
|
||||
|
||||
Training a model with a chat template is a good way to ensure the template matches the tokens the model was trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren’t helpful during training.
|
||||
Training a model with a chat template is a good way to ensure the template matches the tokens the model was trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren't helpful during training.
|
||||
|
||||
An example of preprocessing a dataset with a chat template is shown below.
|
||||
|
||||
@ -212,6 +216,7 @@ dataset = Dataset.from_dict({"chat": [chat1, chat2]})
|
||||
dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)})
|
||||
print(dataset['formatted_chat'][0])
|
||||
```
|
||||
|
||||
```md
|
||||
<|user|>
|
||||
Which is bigger, the moon or the sun?</s>
|
||||
|
@ -18,8 +18,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
Multimodal chat models accept inputs like images, audio or video, in addition to text. The `content` key in a multimodal chat history is a list containing multiple items of different types. This is unlike text-only chat models whose `content` key is a single string.
|
||||
|
||||
|
||||
In the same way the [Tokenizer](./fast_tokenizer) class handles chat templates and tokenization for text-only models,
|
||||
In the same way the [Tokenizer](./fast_tokenizer) class handles chat templates and tokenization for text-only models,
|
||||
the [Processor](./processors) class handles preprocessing, tokenization and chat templates for multimodal models. Their [`~ProcessorMixin.apply_chat_template`] methods are almost identical.
|
||||
|
||||
This guide will show you how to chat with multimodal models with the high-level [`ImageTextToTextPipeline`] and at a lower level using the [`~ProcessorMixin.apply_chat_template`] and [`~GenerationMixin.generate`] methods.
|
||||
@ -46,7 +45,7 @@ messages = [
|
||||
]
|
||||
```
|
||||
|
||||
Create an [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map=“auto”](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Setting the data type to [auto](./models#model-data-type) also helps save memory and improve speed.
|
||||
Create an [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Setting the data type to [auto](./models#model-data-type) also helps save memory and improve speed.
|
||||
|
||||
```python
|
||||
import torch
|
||||
@ -57,8 +56,7 @@ out = pipe(text=messages, max_new_tokens=128)
|
||||
print(out[0]['generated_text'][-1]['content'])
|
||||
```
|
||||
|
||||
|
||||
```
|
||||
```text
|
||||
Ahoy, me hearty! These be two feline friends, likely some tabby cats, taking a siesta on a cozy pink blanket. They're resting near remote controls, perhaps after watching some TV or just enjoying some quiet time together. Cats sure know how to find comfort and relaxation, don't they?
|
||||
```
|
||||
|
||||
@ -66,10 +64,9 @@ Aside from the gradual descent from pirate-speak into modern American English (i
|
||||
|
||||
## Using `apply_chat_template`
|
||||
|
||||
Like [text-only models](./chat_templating), use the [`~ProcessorMixin.apply_chat_template`] method to prepare the chat messages for multimodal models.
|
||||
Like [text-only models](./chat_templating), use the [`~ProcessorMixin.apply_chat_template`] method to prepare the chat messages for multimodal models.
|
||||
This method handles the tokenization and formatting of the chat messages, including images and other media types. The resulting inputs are passed to the model for generation.
|
||||
|
||||
|
||||
```python
|
||||
from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
|
||||
@ -99,8 +96,7 @@ processed_chat = processor.apply_chat_template(messages, add_generation_prompt=T
|
||||
print(list(processed_chat.keys()))
|
||||
```
|
||||
|
||||
|
||||
```
|
||||
```text
|
||||
['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw']
|
||||
```
|
||||
|
||||
@ -113,14 +109,13 @@ print(processor.decode(out[0]))
|
||||
|
||||
The decoded output contains the full conversation so far, including the user message and the placeholder tokens that contain the image information. You may need to trim the previous conversation from the output before displaying it to the user.
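A small sketch of that trimming step, reusing the `processed_chat`, `out`, and `processor` variables from the snippet above:

```py
# Keep only the tokens generated after the prompt, then decode just those
new_tokens = out[0][processed_chat["input_ids"].shape[-1]:]
print(processor.decode(new_tokens, skip_special_tokens=True))
```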
|
||||
|
||||
|
||||
## Video inputs
|
||||
|
||||
Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs).
|
||||
|
||||
- The content `"type"` should be `"video"` to indicate the content is a video.
|
||||
- For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
|
||||
- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you’ve already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save to files or store it in an URL.
|
||||
- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you've already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save to files or store it in an URL.
|
||||
|
||||
> [!WARNING]
|
||||
> Loading a video from `"url"` is only supported by the PyAV or Decord backends.
|
||||
@ -148,6 +143,7 @@ messages = [
|
||||
```
|
||||
|
||||
### Example: Passing decoded video objects
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
@ -167,7 +163,9 @@ messages = [
|
||||
},
|
||||
]
|
||||
```
|
||||
|
||||
You can also use the existing `load_video()` function to load a video, edit it in memory, and pass it in the messages.
|
||||
|
||||
```python
|
||||
|
||||
# Make sure a video backend library (pyav, decord, or torchvision) is available.
|
||||
@ -200,7 +198,6 @@ Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input
|
||||
|
||||
The `num_frames` parameter controls how many frames to uniformly sample from the video. Each checkpoint has a maximum frame count it was pretrained with and exceeding this count can significantly lower generation quality. It's important to choose a frame count that fits both the model capacity and your hardware resources. If `num_frames` isn't specified, the entire video is loaded without any frame sampling.
|
||||
|
||||
|
||||
```python
|
||||
processed_chat = processor.apply_chat_template(
|
||||
messages,
|
||||
@ -265,4 +262,3 @@ print(processed_chat.keys())
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
@ -18,7 +18,6 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
A chat template is a [Jinja](https://jinja.palletsprojects.com/en/stable/templates/) template stored in the tokenizer's [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax.
|
||||
|
||||
|
||||
```jinja
|
||||
{%- for message in messages %}
|
||||
{{- '<|' + message['role'] + '|>\n' }}
|
||||
@ -30,8 +29,8 @@ A chat template is a [Jinja](https://jinja.palletsprojects.com/en/stable/templat
|
||||
```
|
||||
|
||||
If you stare at this for a while, you should realize that this is actually very like Python, albeit with some strange
|
||||
`{%-` syntax. The template iterates over a list of messages, and for each message, it prints the role and content of
|
||||
the message, followed by an end-of-sequence token. If `add_generation_prompt=True`, it adds
|
||||
`{%-` syntax. The template iterates over a list of messages, and for each message, it prints the role and content of
|
||||
the message, followed by an end-of-sequence token. If `add_generation_prompt=True`, it adds
|
||||
the starting header for an assistant message to the end of the conversation.
|
||||
|
||||
Load the written template as a string and assign it to the tokenizer's `chat_template` attribute. Once set, the template is used whenever you call [`~PreTrainedTokenizerBase.apply_chat_template`]. It is also saved
|
||||
@ -42,7 +41,7 @@ edit this file directly to change the template, which is often easier than manip
|
||||
|
||||
The easiest way to start writing Jinja templates is to refer to existing templates. Use `print(tokenizer.chat_template)` on any chat model to see the template it's using. Try starting with simple models that don't call any tools or support RAG because tool-use models can have very complex templates. Finally, take a look at the [Jinja documentation](https://jinja.palletsprojects.com/en/stable/templates/#synopsis) for more details about formatting and syntax.
|
||||
|
||||
There are some specific tips and pitfalls you may encounter while writing chat templates specifically, though, and this section will cover some of them in more detail.
|
||||
There are some specific tips and pitfalls you may encounter while writing chat templates specifically, though, and this section will cover some of them in more detail.
|
||||
|
||||
### Writing multimodal chat templates
|
||||
|
||||
@ -108,7 +107,6 @@ We strongly recommend using `-` to ensure only the intended content is printed.
|
||||
|
||||
### Special variables and callables
|
||||
|
||||
|
||||
The only constants in a template are the `messages` variable and the `add_generation_prompt` boolean. However, you have
|
||||
access to **any other keyword arguments that are passed** to the [`~PreTrainedTokenizerBase.apply_chat_template`] method.
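For instance, a sketch of passing an extra variable through (the checkpoint and the `cutoff_date` name are purely illustrative; the extra argument only has an effect if the template reads it):

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # example checkpoint
messages = [{"role": "user", "content": "Hi!"}]

# Any extra keyword argument becomes a variable the template can access
formatted = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, cutoff_date="2024-06-01"
)
```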
|
||||
|
||||
@ -133,7 +131,7 @@ Make the changes below to ensure compatibility across all Jinja implementations.
|
||||
|
||||
### Big templates
|
||||
|
||||
Newer models or models with features like [tool-calling](./chat_extras#tools) and [RAG](./chat_extras#retrieval-augmented-generation-rag) require larger templates that can be longer than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file corresponds exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues.
|
||||
Newer models or models with features like [tool-calling](./chat_extras) and RAG require larger templates that can be longer than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file corresponds exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues.
|
||||
|
||||
Write the template in a separate file and extract it to the chat template.
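A minimal sketch of that workflow (the filename and checkpoint are illustrative):

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # example checkpoint

# Keep the template in its own file so error line numbers match the file,
# then assign it back to the tokenizer
with open("template.jinja") as f:
    tokenizer.chat_template = f.read()

# Saving the tokenizer stores the template alongside it
tokenizer.save_pretrained("my-model-with-template")
```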
|
||||
|
||||
@ -166,22 +164,22 @@ The example below shows how a tool is defined in JSON schema format.
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "function",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "multiply",
|
||||
"description": "A function that multiplies two numbers",
|
||||
"name": "multiply",
|
||||
"description": "A function that multiplies two numbers",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"a": {
|
||||
"type": "number",
|
||||
"type": "number",
|
||||
"description": "The first number to multiply"
|
||||
},
|
||||
},
|
||||
"b": {
|
||||
"type": "number",
|
||||
"description": "The second number to multiply"
|
||||
}
|
||||
},
|
||||
},
|
||||
"required": ["a", "b"]
|
||||
}
|
||||
}
|
||||
@ -190,7 +188,7 @@ The example below shows how a tool is defined in JSON schema format.
|
||||
|
||||
An example of handling tool definitions in a chat template is shown below. The specific tokens and layouts should be changed to match the ones the model was trained with.
|
||||
|
||||
```
|
||||
```jinja
|
||||
{%- if tools %}
|
||||
{%- for tool in tools %}
|
||||
{{- '<tool>' + tool['function']['name'] + '\n' }}
|
||||
@ -228,7 +226,7 @@ Tool calls are generally passed in the `tool_calls` key of an `"assistant”` me
|
||||
|
||||
A common pattern for handling tool calls is shown below. You can use this as a starting point, but make sure your template actually matches the format the model was trained with!
|
||||
|
||||
```
|
||||
```jinja
|
||||
{%- if message['role'] == 'assistant' and 'tool_calls' in message %}
|
||||
{%- for tool_call in message['tool_calls'] %}
|
||||
{{- '<tool_call>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n</tool_call>' }}
|
||||
@ -251,7 +249,7 @@ Tool responses are message dicts with the `tool` role. They are much simpler tha
|
||||
|
||||
Some templates may not even need the `name` key, in which case, you can write your template to only read the `content` key.
|
||||
|
||||
```
|
||||
```jinja
|
||||
{%- if message['role'] == 'tool' %}
|
||||
{{- "<tool_result>" + message['content'] + "</tool_result>" }}
|
||||
{%- endif %}
|
||||
|
@ -48,7 +48,6 @@ transformers chat -h
|
||||
|
||||
The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). It uses the `transformers serve` CLI under the hood ([docs](./serving.md#serve-cli)).
|
||||
|
||||
|
||||
## TextGenerationPipeline
|
||||
|
||||
[`TextGenerationPipeline`] is a high-level text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).
|
||||
@ -109,7 +108,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
||||
pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config})
|
||||
```
|
||||
|
||||
In general, model size and performance are directly correlated. Larger models are slower in addition to requiring more memory because each active parameter must be read from memory for every generated token.
|
||||
In general, model size and performance are directly correlated. Larger models are slower in addition to requiring more memory because each active parameter must be read from memory for every generated token.
|
||||
This is a bottleneck for LLM text generation and the main options for improving generation speed are to either quantize a model or use hardware with higher memory bandwidth. Adding more compute power doesn't meaningfully help.
|
||||
|
||||
You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token at a time. This significantly alleviates the bandwidth bottleneck and improves generation speed.
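A rough sketch of what that looks like with the `assistant_model` argument of `generate` (the checkpoints are only examples; the draft model should share the main model's tokenizer):

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto")
assistant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", device_map="auto")

inputs = tokenizer("Speculative decoding helps because", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```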
|
||||
|
@ -21,9 +21,10 @@ where `port` is the port used by `transformers serve` (`8000` by default). On th
|
||||
</h3>
|
||||
|
||||
You're now ready to set things up on the app side! In Cursor, while you can't set a new provider, you can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set your `transformers serve` endpoint, follow this order:
|
||||
|
||||
1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
|
||||
2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`)
|
||||
3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty;
|
||||
3. Add some random text to OpenAI API Key. This field won't be used, but it can't be empty;
|
||||
4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`);
|
||||
5. Hit "Verify".
|
||||
|
||||
@ -38,5 +39,3 @@ You are now ready to use your local model in Cursor! For instance, if you toggle
|
||||
<h3 align="center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor_chat.png"/>
|
||||
</h3>
|
||||
|
||||
|
||||
|
@ -35,7 +35,7 @@ pip install deepspeed
|
||||
|
||||
PyTorch comes with its own CUDA toolkit, but to use DeepSpeed with PyTorch, you need to have an identical version of CUDA installed system-wide. For example, if you installed PyTorch with `cudatoolkit==10.2` in your Python environment, then you'll also need to have CUDA 10.2 installed everywhere.
|
||||
|
||||
The exact location can vary from system to system, but `usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly set up and added to your `PATH` environment variable, you can find the installation location with the following command.
|
||||
The exact location can vary from system to system, but `/usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly set up and added to your `PATH` environment variable, you can find the installation location with the following command.
|
||||
|
||||
```bash
|
||||
which nvcc
|
||||
@ -45,7 +45,7 @@ which nvcc
|
||||
|
||||
You may also have more than one CUDA toolkit installed on your system.
|
||||
|
||||
```bash
|
||||
```text
|
||||
/usr/local/cuda-10.2
|
||||
/usr/local/cuda-11.0
|
||||
```
|
||||
|
@ -294,7 +294,7 @@ Consider running a [benchmark](https://github.com/microsoft/DeepSpeed/issues/998
|
||||
|
||||
The example ZeRO-3 and ZeRO-Infinity config below sets most of the parameter values to `auto`, but you can also manually set configure these values.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
@ -383,7 +383,7 @@ Gradient checkpointing saves memory by only storing *some* of the intermediate a
|
||||
|
||||
The batch size can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets `train_micro_batch_size_per_gpu` and `train_batch_size` to the value of `world_size * per_device_train_batch_size * gradient_accumulation_steps`.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"train_batch_size": "auto"
|
||||
@ -400,7 +400,7 @@ Reduce operations are lossy, for example, when gradients are averaged across mul
|
||||
|
||||
Choose the communication data type by setting the `communication_data_type` parameter in the config file. For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it's downcasted to whichever half-precision data type you're training in.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"communication_data_type": "fp32"
|
||||
}
|
||||
@ -412,7 +412,7 @@ Gradient accumulation accumulates gradients over several mini-batches of data be
|
||||
|
||||
Gradient accumulation can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets it to the value of `gradient_accumulation_steps`.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"gradient_accumulation_steps": "auto"
|
||||
}
|
||||
@ -424,7 +424,7 @@ Gradient clipping is useful for preventing exploding gradients which can lead to
|
||||
|
||||
Gradient clipping can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets it to the value of `max_grad_norm`.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"gradient_clipping": "auto"
|
||||
}
|
||||
@ -439,7 +439,7 @@ Mixed precision accelerates training speed by performing some calculations in ha
|
||||
|
||||
Train in fp32 if a model wasn't pretrained in mixed precision because it may cause underflow or overflow errors. Disable fp16, the default, in this case.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": false
|
||||
@ -454,7 +454,7 @@ For Ampere GPUs and PyTorch 1.7+, the more efficient [tf32](https://pytorch.org/
|
||||
|
||||
To configure AMP-like fp16 mixed precision, set up the config as shown below with `"auto"` or your own values. [`Trainer`] automatically enables or disables fp16 based on the value of `fp16_backend`, and the rest of the config can be set by you. fp16 is enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend amp` or `--fp16_full_eval`.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
@ -471,7 +471,7 @@ For additional DeepSpeed fp16 training options, take a look at the [FP16 Trainin
|
||||
|
||||
To configure Apex-like fp16 mixed precision, set up the config as shown below with `"auto"` or your own values. [`Trainer`] automatically configures `amp` based on the values of `fp16_backend` and `fp16_opt_level`. It can also be enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend apex` or `--fp16_opt_level O1`.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"amp": {
|
||||
"enabled": "auto",
|
||||
@ -486,11 +486,11 @@ To configure Apex-like fp16 mixed precision, set up the config as shown below wi
|
||||
> [!TIP]
|
||||
> bf16 requires DeepSpeed 0.6.0.
|
||||
|
||||
bf16 has the same dynamic range as fp32, and doesn’t require loss scaling unlike fp16. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desirable because the lower precision can lead to lossy accumulation.
|
||||
bf16 has the same dynamic range as fp32, and doesn't require loss scaling unlike fp16. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desirable because the lower precision can lead to lossy accumulation.
|
||||
|
||||
bf16 can be set up in the config file or enabled from the command line when the following arguments are passed: `--bf16` or `--bf16_full_eval`.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
@ -514,7 +514,7 @@ DeepSpeed offers several [optimizers](https://www.deepspeed.ai/docs/config-json/
|
||||
|
||||
You can set the parameters to `"auto"` or manually input your own values.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
@ -530,7 +530,7 @@ You can set the parameters to `"auto"` or manually input your own values.
|
||||
|
||||
Use an unsupported optimizer by adding the following to the top level configuration.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"zero_allow_untested_optimizer": true
|
||||
}
|
||||
@ -538,7 +538,7 @@ Use an unsupported optimizer by adding the following to the top level configurat
|
||||
|
||||
From DeepSpeed 0.8.3+, if you want to use offload, you'll also need to add the following to the top level configuration because offload works best with DeepSpeed's CPU Adam optimizer.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"zero_force_ds_cpu_optimizer": false
|
||||
}
|
||||
@ -558,7 +558,7 @@ If you don't configure the scheduler in the config file, [`Trainer`] automatical
|
||||
|
||||
You can set the parameters to `"auto"` or manually input your own values.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"scheduler": {
|
||||
"type": "WarmupDecayLR",
|
||||
@ -581,7 +581,7 @@ You can set the parameters to `"auto"` or manually input your own values.
|
||||
|
||||
Resume training with a Universal checkpoint by setting `load_universal` to `true` in the config file.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"checkpoint": {
|
||||
"load_universal": true
|
||||
@ -640,7 +640,7 @@ deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \
|
||||
|
||||
A multi-node setup consists of multiple nodes, where each node has one or more GPUs running a workload. DeepSpeed expects a shared storage system, but if this is not the case, you need to adjust the config file to include a [checkpoint](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to allow loading without access to a shared filesystem.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"checkpoint": {
|
||||
"use_node_local_storage": true
|
||||
@ -824,7 +824,7 @@ ZeRO-2 saves the model weights in fp16. To save the weights in fp16 for ZeRO-3,
|
||||
|
||||
If you don't, [`Trainer`] won't save the weights in fp16 and won't create a `pytorch_model.bin` file. This is because DeepSpeed's state_dict contains a placeholder instead of the real weights, so you won't be able to load it.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
@ -986,7 +986,7 @@ NaN loss often occurs when a model is pretrained in bf16 and you try to use it w
|
||||
|
||||
It is also possible that fp16 is causing overflow. For example, if your config file looks like the one below, you may see the following overflow errors in the logs.
|
||||
|
||||
```yaml
|
||||
```json
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
|
@ -226,7 +226,7 @@ tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir")
|
||||
|
||||
<Youtube id="Yffk5aydLzg"/>
|
||||
|
||||
A Transformers model expects the input to be a PyTorch or NumPy tensor. A tokenizers job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter.
|
||||
A Transformers model expects the input to be a PyTorch or NumPy tensor. A tokenizer's job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter.
|
||||
|
||||
```py
|
||||
from transformers import AutoTokenizer
|
||||
|
@ -229,6 +229,7 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
## Custom generation methods
|
||||
|
||||
Custom generation methods enable specialized behavior such as:
|
||||
|
||||
- have the model continue thinking if it is uncertain;
|
||||
- roll back generation if the model gets stuck;
|
||||
- handle special tokens with custom logic;
|
||||
@ -289,7 +290,7 @@ print(tokenizer.batch_decode(gen_out)[0])
|
||||
|
||||
If the custom method has pinned Python requirements that your environment doesn't meet, you'll get an exception about missing requirements. For instance, [transformers-community/custom_generate_bad_requirements](https://huggingface.co/transformers-community/custom_generate_bad_requirements) has an impossible set of requirements defined in its `custom_generate/requirements.txt` file, and you'll see the error message below if you try to run it.
|
||||
|
||||
```
|
||||
```text
|
||||
ImportError: Missing requirements in your local environment for `transformers-community/custom_generate_bad_requirements`:
|
||||
foo (installed: None)
|
||||
bar==0.0.0 (installed: None)
|
||||
@ -301,6 +302,7 @@ Updating your Python requirements accordingly will remove this error message.
|
||||
### Creating a custom generation method
|
||||
|
||||
To create a new generation method, you need to create a new [**Model**](https://huggingface.co/new) repository and push a few files into it.
|
||||
|
||||
1. The model you've designed your generation method with.
|
||||
2. `custom_generate/generate.py`, which contains all the logic for your custom generation method.
|
||||
3. `custom_generate/requirements.txt`, used to optionally add new Python requirements and/or lock specific versions to correctly use your method.
|
||||
@ -308,7 +310,7 @@ To create a new generation method, you need to create a new [**Model**](https://
|
||||
|
||||
After you've added all required files, your repository should look like this
|
||||
|
||||
```
|
||||
```text
|
||||
your_repo/
|
||||
├── README.md # include the 'custom_generate' tag
|
||||
├── config.json
|
||||
@ -377,6 +379,7 @@ def generate(model, input_ids, generation_config=None, left_padding=None, **kwar
|
||||
```
|
||||
|
||||
Follow the recommended practices below to ensure your custom generation method works as expected.
|
||||
|
||||
- Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
|
||||
- Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
|
||||
- Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment.
|
||||
@ -389,7 +392,6 @@ from .utils import some_function
|
||||
|
||||
Only relative imports from the same-level `custom_generate` folder are supported. Parent/sibling folder imports are not valid. The `custom_generate` argument also works locally with any directory that contains a `custom_generate` structure. This is the recommended workflow for developing your custom generation method.
|
||||
|
||||
|
||||
#### requirements.txt
|
||||
|
||||
You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.
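For illustration only, a minimal `custom_generate/requirements.txt` could look like the snippet below; the package names and version pins are placeholders, not requirements of any real method.

```text
# runtime requirements for generate.py (placeholder pins)
transformers>=4.53.0
torch>=2.2
```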
|
||||
@ -400,7 +402,7 @@ The root level `README.md` in the model repository usually describes the model t
|
||||
|
||||
For discoverability, we highly recommend adding the `custom_generate` tag to your repository. To do so, the top of your `README.md` file should look like the example below. After you push the file, you should see the tag in your repository!
|
||||
|
||||
```
|
||||
```text
|
||||
---
|
||||
library_name: transformers
|
||||
tags:
|
||||
@ -411,13 +413,14 @@ tags:
|
||||
```
|
||||
|
||||
Recommended practices:
|
||||
|
||||
- Document input and output differences in [`~GenerationMixin.generate`].
|
||||
- Add self-contained examples to enable quick experimentation.
|
||||
- Describe soft-requirements such as if the method only works well with a certain family of models.
|
||||
|
||||
### Reusing `generate`’s input preparation
|
||||
### Reusing `generate`'s input preparation
|
||||
|
||||
If you're adding a new decoding loop, you might want to preserve the input preparation present in `generate` (batch expansion, attention masks, logits processors, stopping criteria, etc.). You can also pass a **callable** to `custom_generate` to reuse [`~GenerationMixin.generate`]’s full preparation pipeline while overriding only the decoding loop.
|
||||
If you're adding a new decoding loop, you might want to preserve the input preparation present in `generate` (batch expansion, attention masks, logits processors, stopping criteria, etc.). You can also pass a **callable** to `custom_generate` to reuse [`~GenerationMixin.generate`]'s full preparation pipeline while overriding only the decoding loop.
|
||||
|
||||
```py
|
||||
def custom_loop(model, input_ids, attention_mask, logits_processor, stopping_criteria, generation_config, **model_kwargs):
|
||||
@ -438,11 +441,12 @@ output = model.generate(
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> If you publish a `custom_generate` repository, your `generate` implementation can itself define a callable and pass it to `model.generate()`. This lets you customize the decoding loop while still benefiting from Transformers’ built-in input preparation logic.
|
||||
> If you publish a `custom_generate` repository, your `generate` implementation can itself define a callable and pass it to `model.generate()`. This lets you customize the decoding loop while still benefiting from Transformers' built-in input preparation logic.
|
||||
|
||||
### Finding custom generation methods
|
||||
|
||||
You can find all custom generation methods by [searching for their custom tag](https://huggingface.co/models?other=custom_generate), `custom_generate`. In addition to the tag, we curate two collections of `custom_generate` methods:
|
||||
|
||||
- [Custom generation methods - Community](https://huggingface.co/collections/transformers-community/custom-generation-methods-community-6888fb1da0efbc592d3a8ab6) -- a collection of powerful methods contributed by the community;
|
||||
- [Custom generation methods - Tutorials](https://huggingface.co/collections/transformers-community/custom-generation-methods-tutorials-6823589657a94940ea02cfec) -- a collection of reference implementations for methods that previously were part of `transformers`, as well as tutorials for `custom_generate`.
|
||||
|
||||
|
@ -185,9 +185,9 @@ See the [Fine-tune a pretrained model](https://huggingface.co/docs/transformers/
|
||||
|
||||
The model head refers to the last layer of a neural network that accepts the raw hidden states and projects them onto a different dimension. There is a different model head for each task. For example:
|
||||
|
||||
* [`GPT2ForSequenceClassification`] is a sequence classification head - a linear layer - on top of the base [`GPT2Model`].
|
||||
* [`ViTForImageClassification`] is an image classification head - a linear layer on top of the final hidden state of the `CLS` token - on top of the base [`ViTModel`].
|
||||
* [`Wav2Vec2ForCTC`] is a language modeling head with [CTC](#connectionist-temporal-classification-ctc) on top of the base [`Wav2Vec2Model`].
|
||||
* [`GPT2ForSequenceClassification`] is a sequence classification head - a linear layer - on top of the base [`GPT2Model`].
|
||||
* [`ViTForImageClassification`] is an image classification head - a linear layer on top of the final hidden state of the `CLS` token - on top of the base [`ViTModel`].
|
||||
* [`Wav2Vec2ForCTC`] is a language modeling head with [CTC](#connectionist-temporal-classification-ctc) on top of the base [`Wav2Vec2Model`].
|
||||
|
||||
## I
|
||||
|
||||
|
@ -149,4 +149,4 @@ Call [print_trainable_parameters](https://huggingface.co/docs/peft/package_refer
|
||||
```py
|
||||
model.print_trainable_parameters()
|
||||
"trainable params: 589,824 || all params: 94,274,096 || trainable%: 0.6256"
|
||||
```
|
||||
```
|
||||
|
@ -19,7 +19,6 @@ rendered properly in your Markdown viewer.
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
|
||||
</h3>
|
||||
|
||||
|
||||
Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
|
||||
vision, audio, video, and multimodal models, for both inference and training.
|
||||
|
||||
@ -35,6 +34,10 @@ There are over 1M+ Transformers [model checkpoints](https://huggingface.co/model
|
||||
|
||||
Explore the [Hub](https://huggingface.com/) today to find a model and use Transformers to help you get started right away.
|
||||
|
||||
Explore the [Models Timeline](./models_timeline) to discover the latest text, vision, audio and multimodal model architectures in Transformers.
|
||||
|
||||
|
||||
|
||||
## Features
|
||||
|
||||
Transformers provides everything you need for inference or training with state-of-the-art pretrained models. Some of the main features include:
|
||||
@ -61,4 +64,4 @@ Transformers is designed for developers and machine learning engineers and resea
|
||||
|
||||
## Learn
|
||||
|
||||
If you're new to Transformers or want to learn more about transformer models, we recommend starting with the [LLM course](https://huggingface.co/learn/llm-course/chapter1/1?fw=pt). This comprehensive course covers everything from the fundamentals of how transformer models work to practical applications across various tasks. You'll learn the complete workflow, from curating high-quality datasets to fine-tuning large language models and implementing reasoning capabilities. The course contains both theoretical and hands-on exercises to build a solid foundational knowledge of transformer models as you learn.
|
||||
If you're new to Transformers or want to learn more about transformer models, we recommend starting with the [LLM course](https://huggingface.co/learn/llm-course/chapter1/1?fw=pt). This comprehensive course covers everything from the fundamentals of how transformer models work to practical applications across various tasks. You'll learn the complete workflow, from curating high-quality datasets to fine-tuning large language models and implementing reasoning capabilities. The course contains both theoretical and hands-on exercises to build a solid foundational knowledge of transformer models as you learn.
|
||||
|
@ -20,7 +20,6 @@ This page lists all of Transformers general utility functions that are found in
|
||||
|
||||
Most of those are only useful if you are studying the general code in the library.
|
||||
|
||||
|
||||
## Enums and namedtuples
|
||||
|
||||
[[autodoc]] utils.ExplicitEnum
|
||||
|
@ -65,7 +65,6 @@ values. Here, for instance, it has two keys that are `sequences` and `scores`.
|
||||
|
||||
We document here all output types.
|
||||
|
||||
|
||||
[[autodoc]] generation.GenerateDecoderOnlyOutput
|
||||
|
||||
[[autodoc]] generation.GenerateEncoderDecoderOutput
|
||||
@ -74,13 +73,11 @@ We document here all output types.
|
||||
|
||||
[[autodoc]] generation.GenerateBeamEncoderDecoderOutput
|
||||
|
||||
|
||||
## LogitsProcessor
|
||||
|
||||
A [`LogitsProcessor`] can be used to modify the prediction scores of a language model head for
|
||||
generation.
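As a quick sketch of the interface (the processor below is a made-up example rather than one of the built-in classes documented here), a logits processor receives the running `input_ids` and the next-token `scores` and returns modified scores:

```py
import torch
from transformers import LogitsProcessor, LogitsProcessorList

class BoostTokenLogitsProcessor(LogitsProcessor):
    """Hypothetical processor that adds a constant bonus to a single token id."""

    def __init__(self, token_id: int, bonus: float = 5.0):
        self.token_id = token_id
        self.bonus = bonus

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        scores = scores.clone()
        scores[:, self.token_id] += self.bonus
        return scores

# pass it to `generate` with `logits_processor=LogitsProcessorList([BoostTokenLogitsProcessor(token_id=0)])`
```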
|
||||
|
||||
|
||||
[[autodoc]] AlternatingCodebooksLogitsProcessor
|
||||
- __call__
|
||||
|
||||
@ -174,8 +171,6 @@ generation.
|
||||
[[autodoc]] WatermarkLogitsProcessor
|
||||
- __call__
|
||||
|
||||
|
||||
|
||||
## StoppingCriteria
|
||||
|
||||
A [`StoppingCriteria`] can be used to change when to stop generation (other than EOS token). Please note that this is exclusively available to our PyTorch implementations.
|
||||
@ -300,7 +295,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens
|
||||
- to_legacy_cache
|
||||
- from_legacy_cache
|
||||
|
||||
|
||||
## Watermark Utils
|
||||
|
||||
[[autodoc]] WatermarkingConfig
|
||||
|
@ -22,8 +22,8 @@ worked around. We don't want for all users of `transformers` to have to install
|
||||
we therefore mark those as soft dependencies rather than hard dependencies.
|
||||
|
||||
The transformers toolkit is not made to error-out on import of a model that has a specific dependency; instead, an
|
||||
object for which you are lacking a dependency will error-out when calling any method on it. As an example, if
|
||||
`torchvision` isn't installed, the fast image processors will not be available.
|
||||
object for which you are lacking a dependency will error-out when calling any method on it. As an example, if
|
||||
`torchvision` isn't installed, the fast image processors will not be available.
|
||||
|
||||
This object is still importable:
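A minimal sketch of what that looks like in practice — the class below is just one example of a fast image processor, and the exact error you see without `torchvision` depends on your environment:

```python
from transformers import DetrImageProcessorFast  # the import itself succeeds even without torchvision

processor = DetrImageProcessorFast()  # only this call errors out if the torchvision dependency is missing
```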
|
||||
|
||||
@ -60,7 +60,7 @@ PyTorch dependency
|
||||
|
||||
**Tokenizers**: All files starting with `tokenization_` and ending with `_fast` have an automatic `tokenizers` dependency
|
||||
|
||||
**Vision**: All files starting with `image_processing_` have an automatic dependency to the `vision` dependency group;
|
||||
**Vision**: All files starting with `image_processing_` have an automatic dependency to the `vision` dependency group;
|
||||
at the time of writing, this only contains the `pillow` dependency.
|
||||
|
||||
**Vision + Torch + Torchvision**: All files starting with `image_processing_` and ending with `_fast` have an automatic
|
||||
@ -71,7 +71,7 @@ All of these automatic dependencies are added on top of the explicit dependencie
|
||||
### Explicit Object Dependencies
|
||||
|
||||
We add a method called `requires` that is used to explicitly specify the dependencies of a given object. As an
|
||||
example, the `Trainer` class has two hard dependencies: `torch` and `accelerate`. Here is how we specify these
|
||||
example, the `Trainer` class has two hard dependencies: `torch` and `accelerate`. Here is how we specify these
|
||||
required dependencies:
|
||||
|
||||
```python
|
||||
|
@ -21,10 +21,8 @@ provides for it.
|
||||
|
||||
Most of those are only useful if you are adding new models in the library.
|
||||
|
||||
|
||||
## Model addition debuggers
|
||||
|
||||
|
||||
### Model addition debugger - context manager for model adders
|
||||
|
||||
This context manager is a power user tool intended for model adders. It tracks all forward calls within a model forward
|
||||
@ -72,7 +70,6 @@ with model_addition_debugger_context(
|
||||
|
||||
```
|
||||
|
||||
|
||||
### Reading results
|
||||
|
||||
The debugger generates two files from the forward call, both with the same base name, but ending either with
|
||||
@ -221,9 +218,9 @@ path reference to the associated `.safetensors` file. Each tensor is written to
|
||||
the state dictionary. File names are constructed using the `module_path` as a prefix with a few possible postfixes that
|
||||
are built recursively.
|
||||
|
||||
* Module inputs are denoted with the `_inputs` and outputs by `_outputs`.
|
||||
* `list` and `tuple` instances, such as `args` or function return values, will be postfixed with `_{index}`.
|
||||
* `dict` instances will be postfixed with `_{key}`.
|
||||
* Module inputs are denoted with the `_inputs` and outputs by `_outputs`.
|
||||
* `list` and `tuple` instances, such as `args` or function return values, will be postfixed with `_{index}`.
|
||||
* `dict` instances will be postfixed with `_{key}`.
|
||||
|
||||
### Comparing between implementations
|
||||
|
||||
@ -231,10 +228,8 @@ Once the forward passes of two models have been traced by the debugger, one can
|
||||
below: we can see slight differences between these two implementations' key projection layer. Inputs are mostly
|
||||
identical, but not quite. Looking through the file differences makes it easier to pinpoint which layer is wrong.
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
### Limitations and scope
|
||||
|
||||
This feature will only work for torch-based models, and would require more work and case-by-case approach for say
|
||||
@ -254,13 +249,14 @@ layers.
|
||||
|
||||
This small util is a power user tool intended for model adders and maintainers. It lists all test methods
|
||||
existing in `test_modeling_common.py`, inherited by all model tester classes, and scans the repository to measure
|
||||
how many tests are being skipped and for which models.
|
||||
how many tests are being skipped and for which models.
|
||||
|
||||
### Rationale
|
||||
|
||||
When porting models to transformers, tests fail as they should, and sometimes `test_modeling_common` feels irreconcilable with the peculiarities of our brand new model. But how can we be sure we're not breaking everything by adding a seemingly innocent skip?
|
||||
|
||||
This utility:
|
||||
|
||||
- scans all test_modeling_common methods
|
||||
- looks for times where a method is skipped
|
||||
- returns a summary json you can load as a DataFrame/inspect
|
||||
@ -269,8 +265,7 @@ This utility:
|
||||
|
||||

|
||||
|
||||
|
||||
### Usage
|
||||
### Usage
|
||||
|
||||
You can run the skipped test analyzer in two ways:
|
||||
|
||||
@ -286,7 +281,7 @@ python utils/scan_skipped_tests.py --output_dir path/to/output
|
||||
|
||||
**Example output:**
|
||||
|
||||
```
|
||||
```text
|
||||
🔬 Parsing 331 model test files once each...
|
||||
📝 Aggregating 224 tests...
|
||||
(224/224) test_update_candidate_strategy_with_matches_1es_3d_is_nonecodet_schedule_fa_kwargs
|
||||
|
@ -20,7 +20,6 @@ This page lists all the utility functions the library provides for pipelines.
|
||||
|
||||
Most of those are only useful if you are studying the code of the models in the library.
|
||||
|
||||
|
||||
## Argument handling
|
||||
|
||||
[[autodoc]] pipelines.ArgumentHandler
|
||||
|
@ -25,7 +25,7 @@ You are now ready to chat!
|
||||
|
||||
To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following to your Jan machine's terminal
|
||||
|
||||
```
|
||||
```bash
|
||||
ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server
|
||||
```
|
||||
|
||||
|
@ -67,7 +67,7 @@ out = model.generate(**inputs, do_sample=False, max_new_tokens=20, past_key_valu
|
||||
|
||||
## Fixed-size cache
|
||||
|
||||
The default [`DynamicCache`] prevents you from taking advantage of most just-in-time (JIT) optimizations because the cache size isn't fixed. JIT optimizations enable you to minimize latency at the expense of memory usage. All of the following cache types are compatible with JIT optimizations like [torch.compile](./llm_optims#static-kv-cache-and-torchcompile) to accelerate generation.
|
||||
The default [`DynamicCache`] prevents you from taking advantage of most just-in-time (JIT) optimizations because the cache size isn't fixed. JIT optimizations enable you to minimize latency at the expense of memory usage. All of the following cache types are compatible with JIT optimizations like [torch.compile](./llm_optims#static-kv-cache-and-torchcompile) to accelerate generation.
|
||||
|
||||
A fixed-size cache ([`StaticCache`]) pre-allocates a specific maximum cache size for the kv pairs. You can generate up to the maximum cache size without needing to modify it. However, having a fixed (usually large) size for the key/value states means that while generating, many positions are masked because they should not take part in the attention. This trick makes it easy to `compile` the decoding stage, but it wastes computation in the attention step. Like most things, it's a trade-off: it works very well if you generate several sequences of roughly the same length, but it may be sub-optimal if you have, for example, one very long sequence followed by only short ones (since the fixed cache size would be large, much of it would be wasted on the short sequences). Make sure you understand the impact if you use it!
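As a sketch of how you would typically opt into it — the checkpoint below is a placeholder, and `cache_implementation="static"` is the high-level switch (a [`StaticCache`] instance can also be constructed and passed explicitly):

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

inputs = tokenizer("Fixed-size caches are useful because", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=20, cache_implementation="static")
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```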
|
||||
|
||||
@ -213,7 +213,7 @@ A cache can also work in iterative generation settings where there is back-and-f
|
||||
|
||||
For iterative generation with a cache, start by initializing an empty cache class and then you can feed in your new prompts. Keep track of dialogue history with a [chat template](./chat_templating).
|
||||
|
||||
The following example demonstrates [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). If you’re using a different chat-style model, [`~PreTrainedTokenizer.apply_chat_template`] may process messages differently. It might cut out important tokens depending on how the Jinja template is written.
|
||||
The following example demonstrates [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). If you're using a different chat-style model, [`~PreTrainedTokenizer.apply_chat_template`] may process messages differently. It might cut out important tokens depending on how the Jinja template is written.
|
||||
|
||||
For example, some models use special `<think> ... </think>` tokens during reasoning. These could get lost during re-encoding, causing indexing issues. You might need to manually remove or adjust extra tokens from the completions to keep things stable.
|
||||
|
||||
|
@ -35,6 +35,7 @@ Before you begin, it's helpful to install [bitsandbytes](https://hf.co/docs/bits
|
||||
```bash
|
||||
!pip install -U transformers bitsandbytes
|
||||
```
|
||||
|
||||
Bitsandbytes supports multiple backends in addition to CUDA-based GPUs. Refer to the multi-backend installation [guide](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend) to learn more.
|
||||
|
||||
Load a LLM with [`~PreTrainedModel.from_pretrained`] and add the following two parameters to reduce the memory requirements.
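As a sketch of the kind of call this refers to — assuming a 4-bit bitsandbytes setup, with a placeholder checkpoint name:

```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",  # placeholder checkpoint
    device_map="auto",  # dispatch the layers across the available devices
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # load the weights in 4-bit
)
```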
|
||||
@ -92,6 +93,7 @@ model.generate(**inputs, num_beams=4, do_sample=True)
|
||||
```
|
||||
|
||||
[`~GenerationMixin.generate`] can also be extended with external libraries or custom code:
|
||||
|
||||
1. the `logits_processor` parameter accepts custom [`LogitsProcessor`] instances for manipulating the next token probability distribution;
|
||||
2. the `stopping_criteria` parameter supports custom [`StoppingCriteria`] to stop text generation;
|
||||
3. other custom generation methods can be loaded through the `custom_generate` flag ([docs](generation_strategies.md/#custom-decoding-methods)).
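For instance, a minimal sketch combining the first two extension points (the checkpoint is a placeholder; the processor and criteria are built-in classes used purely for illustration):

```py
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LogitsProcessorList,
    MaxTimeCriteria,
    MinLengthLogitsProcessor,
    StoppingCriteriaList,
)

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
inputs = tokenizer("Hello, my name is", return_tensors="pt")

outputs = model.generate(
    **inputs,
    # require at least 10 tokens before the EOS token is allowed
    logits_processor=LogitsProcessorList([MinLengthLogitsProcessor(10, eos_token_id=model.config.eos_token_id)]),
    # stop generating after roughly 5 seconds
    stopping_criteria=StoppingCriteriaList([MaxTimeCriteria(max_time=5.0)]),
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```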
|
||||
@ -154,7 +156,6 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
||||
| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
|
||||
| `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
|
||||
|
||||
|
||||
## Pitfalls
|
||||
|
||||
The section below covers some common issues you may encounter during text generation and how to solve them.
|
||||
|
@ -66,6 +66,7 @@ If you have access to an 8 x 80GB A100 node, you could load BLOOM as follows
|
||||
```bash
|
||||
!pip install transformers accelerate bitsandbytes optimum
|
||||
```
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
@ -98,7 +99,8 @@ result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single
|
||||
```
|
||||
|
||||
@ -116,7 +118,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```bash
|
||||
|
||||
```text
|
||||
29.0260648727417
|
||||
```
|
||||
|
||||
@ -127,7 +130,6 @@ Note that if we had tried to run the model in full float32 precision, a whopping
|
||||
|
||||
If you are unsure in which format the model weights are stored on the Hub, you can always look into the checkpoint's config under `"dtype"`, *e.g.* [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to set the model to the same precision type as written in the config when loading with `from_pretrained(..., dtype=...)`, except when the original type is float32, in which case one can use either `float16` or `bfloat16` for inference.
|
||||
|
||||
|
||||
Let's define a `flush(...)` function to free all allocated memory so that we can accurately measure the peak allocated GPU memory.
|
||||
|
||||
```python
|
||||
@ -148,6 +150,7 @@ Let's call it now for the next experiment.
|
||||
```python
|
||||
flush()
|
||||
```
|
||||
|
||||
From the Accelerate library, you can also use a device-agnostic utility method called [release_memory](https://github.com/huggingface/accelerate/blob/29be4788629b772a3b722076e433b5b3b5c85da3/src/accelerate/utils/memory.py#L63), which takes various hardware backends like XPU, MLU, NPU, MPS, and more into account.
|
||||
|
||||
```python
|
||||
@ -204,7 +207,8 @@ result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single
|
||||
```
|
||||
|
||||
@ -215,15 +219,16 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
15.219234466552734
|
||||
```
|
||||
|
||||
Significantly less! We're down to just a bit over 15 GBs and could therefore run this model on consumer GPUs like the 4090.
|
||||
We're seeing a very nice gain in memory efficiency and more or less no degradation to the model's output. However, we can also notice a slight slow-down during inference.
|
||||
|
||||
|
||||
We delete the models and flush the memory again.
|
||||
|
||||
```python
|
||||
del model
|
||||
del pipe
|
||||
@ -245,7 +250,8 @@ result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
Here is a Python function that transforms bytes to Giga bytes:\n\n```\ndef bytes_to_gigabytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single argument
|
||||
```
|
||||
|
||||
@ -256,7 +262,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
9.543574333190918
|
||||
```
|
||||
|
||||
@ -270,6 +277,7 @@ Also note that inference here was again a bit slower compared to 8-bit quantizat
|
||||
del model
|
||||
del pipe
|
||||
```
|
||||
|
||||
```python
|
||||
flush()
|
||||
```
|
||||
@ -384,6 +392,7 @@ def alternating(list1, list2):
|
||||
-----
|
||||
"""
|
||||
```
|
||||
|
||||
For demonstration purposes, we duplicate the system prompt by ten so that the input length is long enough to observe Flash Attention's memory savings.
|
||||
We append the original text prompt `"Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here"`
|
||||
|
||||
@ -413,7 +422,8 @@ result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
Generated in 10.96854019165039 seconds.
|
||||
Sure. Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. Here is a function that does that.\n\ndef
|
||||
````
|
||||
@ -429,7 +439,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```bash
|
||||
|
||||
```text
|
||||
37.668193340301514
|
||||
```
|
||||
|
||||
@ -460,7 +471,8 @@ result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
Generated in 3.0211617946624756 seconds.
|
||||
Sure. Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. Here is a function that does that.\n\ndef
|
||||
```
|
||||
@ -474,7 +486,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
32.617331981658936
|
||||
```
|
||||
|
||||
@ -604,7 +617,8 @@ generated_text
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
shape of input_ids torch.Size([1, 21])
|
||||
shape of input_ids torch.Size([1, 22])
|
||||
shape of input_ids torch.Size([1, 23])
|
||||
@ -641,7 +655,8 @@ generated_text
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
shape of input_ids torch.Size([1, 1])
|
||||
length of key-value cache 20
|
||||
shape of input_ids torch.Size([1, 1])
|
||||
@ -675,7 +690,7 @@ Note that, despite our advice to use key-value caches, your LLM output may be sl
|
||||
|
||||
The key-value cache is especially useful for applications such as chat where multiple passes of auto-regressive decoding are required. Let's look at an example.
|
||||
|
||||
```
|
||||
```text
|
||||
User: How many people live in France?
|
||||
Assistant: Roughly 75 million people live in France
|
||||
User: And how many are in Germany?
|
||||
@ -712,7 +727,8 @@ tokenizer.batch_decode(generation_output.sequences)[0][len(prompt):]
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
is a modified version of the function that returns Mega bytes instead.
|
||||
|
||||
def bytes_to_megabytes(bytes):
|
||||
@ -733,7 +749,8 @@ config = model.config
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
7864320000
|
||||
```
|
||||
|
||||
@ -773,7 +790,6 @@ The most notable application of GQA is [Llama-v2](https://huggingface.co/meta-ll
|
||||
|
||||
> In conclusion, it is strongly recommended to make use of either GQA or MQA if the LLM is deployed with auto-regressive decoding and is required to handle large input sequences, as is the case for chat, for example.
|
||||
|
||||
|
||||
## Conclusion
|
||||
|
||||
The research community is constantly coming up with new, nifty ways to speed up inference time for ever-larger LLMs. As an example, one such promising research direction is [speculative decoding](https://huggingface.co/papers/2211.17192) where "easy tokens" are generated by smaller, faster language models and only "hard tokens" are generated by the LLM itself. Going into more detail is out of the scope of this notebook, but you can read more about it in this [nice blog post](https://huggingface.co/blog/assisted-generation).
|
||||
|
@ -54,7 +54,6 @@ The main class that implements callbacks is [`TrainerCallback`]. It gets the
|
||||
Trainer's internal state via [`TrainerState`], and can take some actions on the training loop via
|
||||
[`TrainerControl`].
|
||||
|
||||
|
||||
## Available Callbacks
|
||||
|
||||
Here is the list of the available [`TrainerCallback`] in the library:
|
||||
|
@ -24,7 +24,6 @@ Each derived config class implements model specific attributes. Common attribute
|
||||
`hidden_size`, `num_attention_heads`, and `num_hidden_layers`. Text models further implement:
|
||||
`vocab_size`.
|
||||
|
||||
|
||||
## PretrainedConfig
|
||||
|
||||
[[autodoc]] PretrainedConfig
|
||||
|
@ -25,7 +25,6 @@ on the formed batch.
|
||||
|
||||
Examples of use can be found in the [example scripts](../examples) or [example notebooks](../notebooks).
|
||||
|
||||
|
||||
## Default data collator
|
||||
|
||||
[[autodoc]] data.data_collator.default_data_collator
|
||||
|
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# DeepSpeed
|
||||
|
||||
[DeepSpeed](https://github.com/deepspeedai/DeepSpeed), powered by Zero Redundancy Optimizer (ZeRO), is an optimization library for training and fitting very large models onto a GPU. It is available in several ZeRO stages, where each stage progressively saves more GPU memory by partitioning the optimizer state, gradients, parameters, and enabling offloading to a CPU or NVMe. DeepSpeed is integrated with the [`Trainer`] class and most of the setup is automatically taken care of for you.
|
||||
[DeepSpeed](https://github.com/deepspeedai/DeepSpeed), powered by Zero Redundancy Optimizer (ZeRO), is an optimization library for training and fitting very large models onto a GPU. It is available in several ZeRO stages, where each stage progressively saves more GPU memory by partitioning the optimizer state, gradients, parameters, and enabling offloading to a CPU or NVMe. DeepSpeed is integrated with the [`Trainer`] class and most of the setup is automatically taken care of for you.
|
||||
|
||||
However, if you want to use DeepSpeed without the [`Trainer`], Transformers provides a [`HfDeepSpeedConfig`] class.
|
||||
|
||||
|
@ -15,14 +15,12 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
|
||||
# ExecuTorch
|
||||
|
||||
[`ExecuTorch`](https://github.com/pytorch/executorch) is an end-to-end solution for enabling on-device inference capabilities across mobile and edge devices including wearables, embedded devices and microcontrollers. It is part of the PyTorch ecosystem and supports the deployment of PyTorch models with a focus on portability, productivity, and performance.
|
||||
|
||||
ExecuTorch introduces well defined entry points to perform model, device, and/or use-case specific optimizations such as backend delegation, user-defined compiler transformations, memory planning, and more. The first step in preparing a PyTorch model for execution on an edge device using ExecuTorch is to export the model. This is achieved through the use of a PyTorch API called [`torch.export`](https://pytorch.org/docs/stable/export.html).
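To make the `torch.export` entry point concrete, here is a minimal sketch on a toy module — not a Transformers model, just the general shape of the API being referred to:

```python
import torch

class ToyModule(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x) + 1

# torch.export traces the module into an ExportedProgram that can be lowered further (e.g. by ExecuTorch)
exported_program = torch.export.export(ToyModule(), (torch.randn(2, 4),))
print(exported_program)
```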
|
||||
|
||||
|
||||
## ExecuTorch Integration
|
||||
|
||||
An integration point is being developed to ensure that 🤗 Transformers can be exported using `torch.export`. The goal of this integration is not only to enable export but also to ensure that the exported artifact can be further lowered and optimized to run efficiently in `ExecuTorch`, particularly for mobile and edge use cases.
|
||||
|
@ -18,7 +18,6 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
A feature extractor is in charge of preparing input features for audio or vision models. This includes feature extraction from sequences, e.g., pre-processing audio files to generate Log-Mel Spectrogram features, feature extraction from images, e.g., cropping image files, but also padding, normalization, and conversion to NumPy and PyTorch tensors.
|
||||
|
||||
|
||||
## FeatureExtractionMixin
|
||||
|
||||
[[autodoc]] feature_extraction_utils.FeatureExtractionMixin
|
||||
|
@ -26,6 +26,7 @@ from transformers import AutoImageProcessor
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)
|
||||
```
|
||||
|
||||
Note that `use_fast` will be set to `True` by default in a future release.
|
||||
|
||||
When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise.
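For example, a sketch of moving the preprocessing to the GPU — this assumes a CUDA device is available, and reuses the checkpoint from the snippet above:

```py
import torch
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)
images = [torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)]  # dummy image tensor
inputs = processor(images, device="cuda", return_tensors="pt")  # preprocessing runs on the GPU
print(inputs["pixel_values"].device)
```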
|
||||
@ -57,7 +58,6 @@ Here are some speed comparisons between the base and fast image processors for t
|
||||
|
||||
These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU.
|
||||
|
||||
|
||||
## ImageProcessingMixin
|
||||
|
||||
[[autodoc]] image_processing_utils.ImageProcessingMixin
|
||||
@ -72,7 +72,6 @@ These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon
|
||||
|
||||
[[autodoc]] image_processing_utils.BaseImageProcessor
|
||||
|
||||
|
||||
## BaseImageProcessorFast
|
||||
|
||||
[[autodoc]] image_processing_utils_fast.BaseImageProcessorFast
|
||||
|
@ -55,7 +55,6 @@ logger.info("INFO")
|
||||
logger.warning("WARN")
|
||||
```
|
||||
|
||||
|
||||
All the methods of this logging module are documented below, the main ones are
|
||||
[`logging.get_verbosity`] to get the current level of verbosity in the logger and
|
||||
[`logging.set_verbosity`] to set the verbosity to the level of your choice. In order (from the least
|
||||
@ -81,6 +80,7 @@ We use both in the `transformers` library. We leverage and adapt `logging`'s `ca
|
||||
management of these warning messages by the verbosity setters above.
|
||||
|
||||
What does that mean for developers of the library? We should respect the following heuristics:
|
||||
|
||||
- `warnings` should be favored for developers of the library and libraries dependent on `transformers`
|
||||
- `logging` should be used for end-users of the library using it in every-day projects
|
||||
|
||||
|
@ -26,7 +26,6 @@ file or directory, or from a pretrained model configuration provided by the libr
|
||||
|
||||
The other methods that are common to each model are defined in [`~modeling_utils.ModuleUtilsMixin`] and [`~generation.GenerationMixin`].
|
||||
|
||||
|
||||
## PreTrainedModel
|
||||
|
||||
[[autodoc]] PreTrainedModel
|
||||
|
@ -51,4 +51,3 @@ to export models for different types of topologies or tasks.
|
||||
### FeaturesManager
|
||||
|
||||
[[autodoc]] onnx.features.FeaturesManager
|
||||
|
||||
|
@ -22,7 +22,6 @@ The `.optimization` module provides:
|
||||
- several schedules in the form of schedule objects that inherit from `_LRSchedule`:
|
||||
- a gradient accumulation class to accumulate the gradients of multiple batches
|
||||
|
||||
|
||||
## AdaFactor
|
||||
|
||||
[[autodoc]] Adafactor
|
||||
|
@ -47,7 +47,6 @@ However, this is not always the case. Some models apply normalization or subsequ
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
|
||||
will get `None`. Here for instance `outputs.loss` is the loss computed by the model, and `outputs.attentions` is
|
||||
`None`.
|
||||
|
@ -81,7 +81,6 @@ for out in tqdm(pipe(KeyDataset(dataset, "file"))):
|
||||
|
||||
For ease of use, a generator is also possible:
|
||||
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
@ -160,7 +159,7 @@ for batch_size in [1, 8, 64, 256]:
|
||||
pass
|
||||
```
|
||||
|
||||
```
|
||||
```text
|
||||
# On GTX 970
|
||||
------------------------------
|
||||
Streaming no batching
|
||||
@ -196,8 +195,7 @@ This is a occasional very long sentence compared to the other. In that case, the
|
||||
tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to the high slowdown. Even worse, on
|
||||
bigger batches, the program simply crashes.
|
||||
|
||||
|
||||
```
|
||||
```text
|
||||
------------------------------
|
||||
Streaming no batching
|
||||
100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s]
|
||||
@ -245,7 +243,6 @@ multiple forward pass of a model. Under normal circumstances, this would yield i
|
||||
In order to circumvent this issue, both of these pipelines are a bit specific, they are `ChunkPipeline` instead of
|
||||
regular `Pipeline`. In short:
|
||||
|
||||
|
||||
```python
|
||||
preprocessed = pipe.preprocess(inputs)
|
||||
model_outputs = pipe.forward(preprocessed)
|
||||
@ -254,7 +251,6 @@ outputs = pipe.postprocess(model_outputs)
|
||||
|
||||
Now becomes:
|
||||
|
||||
|
||||
```python
|
||||
all_model_outputs = []
|
||||
for preprocessed in pipe.preprocess(inputs):
|
||||
@ -282,7 +278,6 @@ If you want to override a specific pipeline.
|
||||
Don't hesitate to create an issue for your task at hand, the goal of the pipeline is to be easy to use and support most
|
||||
cases, so `transformers` could maybe support your use case.
|
||||
|
||||
|
||||
If you want to try simply you can:
|
||||
|
||||
- Subclass your pipeline of choice
|
||||
@ -302,7 +297,6 @@ my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline)
|
||||
|
||||
That should enable you to do all the custom code you want.
|
||||
|
||||
|
||||
## Implementing a pipeline
|
||||
|
||||
[Implementing a new pipeline](../add_new_pipeline)
|
||||
@ -329,7 +323,6 @@ Pipelines available for audio tasks include the following.
|
||||
- __call__
|
||||
- all
|
||||
|
||||
|
||||
### ZeroShotAudioClassificationPipeline
|
||||
|
||||
[[autodoc]] ZeroShotAudioClassificationPipeline
|
||||
|
@ -17,6 +17,7 @@ rendered properly in your Markdown viewer.
|
||||
# Processors
|
||||
|
||||
Processors can mean two different things in the Transformers library:
|
||||
|
||||
- the objects that pre-process inputs for multi-modal models such as [Wav2Vec2](../model_doc/wav2vec2) (speech and text)
|
||||
or [CLIP](../model_doc/clip) (text and vision)
|
||||
- deprecated objects that were used in older versions of the library to preprocess data for GLUE or SQUAD.
|
||||
@ -71,7 +72,6 @@ Additionally, the following method can be used to load values from a data file a
|
||||
|
||||
[[autodoc]] data.processors.glue.glue_convert_examples_to_features
|
||||
|
||||
|
||||
## XNLI
|
||||
|
||||
[The Cross-Lingual NLI Corpus (XNLI)](https://www.nyu.edu/projects/bowman/xnli/) is a benchmark that evaluates the
|
||||
@ -88,7 +88,6 @@ Please note that since the gold labels are available on the test set, evaluation
|
||||
|
||||
An example using these processors is given in the [run_xnli.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_xnli.py) script.
|
||||
|
||||
|
||||
## SQuAD
|
||||
|
||||
[The Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer//) is a benchmark that
|
||||
@ -115,11 +114,9 @@ Additionally, the following method can be used to convert SQuAD examples into
|
||||
|
||||
[[autodoc]] data.processors.squad.squad_convert_examples_to_features
|
||||
|
||||
|
||||
These processors as well as the aforementioned method can be used with files containing the data as well as with the
|
||||
*tensorflow_datasets* package. Examples are given below.
|
||||
|
||||
|
||||
### Example usage
|
||||
|
||||
Here is an example using the processors as well as the conversion method using data files:
|
||||
|
@ -30,15 +30,15 @@ like token streaming.
|
||||
## GenerationConfig
|
||||
|
||||
[[autodoc]] generation.GenerationConfig
|
||||
- from_pretrained
|
||||
- from_model_config
|
||||
- save_pretrained
|
||||
- update
|
||||
- validate
|
||||
- get_generation_mode
|
||||
- from_pretrained
|
||||
- from_model_config
|
||||
- save_pretrained
|
||||
- update
|
||||
- validate
|
||||
- get_generation_mode
|
||||
|
||||
## GenerationMixin
|
||||
|
||||
[[autodoc]] GenerationMixin
|
||||
- generate
|
||||
- compute_transition_scores
|
||||
- generate
|
||||
- compute_transition_scores
|
||||
|
@ -22,7 +22,7 @@ Rust library [🤗 Tokenizers](https://github.com/huggingface/tokenizers). The "
|
||||
|
||||
1. a significant speed-up in particular when doing batched tokenization and
|
||||
2. additional methods to map between the original string (character and words) and the token space (e.g. getting the
|
||||
index of the token comprising a given character or the span of characters corresponding to a given token).
|
||||
index of the token comprising a given character or the span of characters corresponding to a given token).
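For illustration, a minimal sketch of these alignment methods with a fast tokenizer (the checkpoint is just an example):

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")  # example checkpoint
encoding = tokenizer("Hello world", return_offsets_mapping=True)

print(encoding.word_ids())         # which word each token comes from
print(encoding.char_to_token(6))   # index of the token covering character 6 ("w")
print(encoding["offset_mapping"])  # character span covered by each token
```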
|
||||
|
||||
The base classes [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`]
|
||||
implement the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and
|
||||
@ -50,12 +50,11 @@ several advanced alignment methods which can be used to map between the original
|
||||
token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
|
||||
to a given token).
|
||||
|
||||
|
||||
# Multimodal Tokenizer
|
||||
|
||||
Apart from that each tokenizer can be a "multimodal" tokenizer which means that the tokenizer will hold all relevant special tokens
|
||||
as part of tokenizer attributes for easier access. For example, if the tokenizer is loaded from a vision-language model like LLaVA, you will
|
||||
be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder.
|
||||
be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder.
|
||||
|
||||
To enable extra special tokens for any type of tokenizer, you have to add the following lines and save the tokenizer. Extra special tokens do not
|
||||
have to be modality related and can be anything that the model often needs access to. In the below code, the tokenizer at `output_dir` will have direct access
|
||||
|
@ -22,7 +22,6 @@ The video processor extends the functionality of image processors by allowing Vi
|
||||
|
||||
When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't updated your VLM, the processor will try to load video related configurations from a file named `preprocessing_config.json`.
|
||||
|
||||
|
||||
### Usage Example
|
||||
Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
|
||||
|
||||
@ -59,7 +58,6 @@ The video processor can also sample video frames using the technique best suited
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
```python
|
||||
from transformers import AutoVideoProcessor
|
||||
|
||||
@ -92,4 +90,3 @@ print(processed_video_inputs.pixel_values_videos.shape)
|
||||
## BaseVideoProcessor
|
||||
|
||||
[[autodoc]] video_processing_utils.BaseVideoProcessor
|
||||
|
||||
|
@ -25,7 +25,6 @@ The abstract from the paper is the following:
|
||||
|
||||
*We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification. Notably, our AIMV2-3B encoder achieves 89.5% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings.*
|
||||
|
||||
|
||||
This model was contributed by [Yaswanth Gali](https://huggingface.co/yaswanthgali).
|
||||
The original code can be found [here](https://github.com/apple/ml-aim).
|
||||
|
||||
|
@ -148,6 +148,7 @@ for label, score in zip(candidate_labels, probs):
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
- Refer to the [Kakao Brain’s Open Source ViT, ALIGN, and the New COYO Text-Image Dataset](https://huggingface.co/blog/vit-align) blog post for more details.
|
||||
|
||||
## AlignConfig
|
||||
|
@ -102,4 +102,4 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
## ArceeForTokenClassification
|
||||
|
||||
[[autodoc]] ArceeForTokenClassification
|
||||
- forward
|
||||
- forward
|
||||
|
@ -98,7 +98,7 @@ print(response)
|
||||
</hfoptions>
|
||||
|
||||
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
|
||||
|
||||
|
||||
The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4 and the [rhymes-ai/Aria-sequential_mlp](https://huggingface.co/rhymes-ai/Aria-sequential_mlp) checkpoint. This checkpoint replaces grouped GEMM with `torch.nn.Linear` layers for easier quantization.
```py

@ -142,7 +142,6 @@ response = processor.decode(output_ids, skip_special_tokens=True)

print(response)
```

## AriaImageProcessor

[[autodoc]] AriaImageProcessor

@ -52,16 +52,16 @@ the authors compute the stats for a downstream dataset.

### Using Scaled Dot Product Attention (SDPA)

PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.

SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.

```py
import torch
from transformers import ASTForAudioClassification

model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", attn_implementation="sdpa", dtype=torch.float16)
...

@ -23,7 +23,6 @@ automatically retrieve the relevant model given the name/path to the pretrained

Instantiating one of [`AutoConfig`], [`AutoModel`], and
[`AutoTokenizer`] will directly create a class of the relevant architecture. For instance

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("google-bert/bert-base-cased")
```
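The same pattern applies to the other Auto classes named above; a short sketch:

```python
# Sketch: each Auto class resolves to the architecture-specific class for the checkpoint.
from transformers import AutoConfig, AutoTokenizer

config = AutoConfig.from_pretrained("google-bert/bert-base-cased")        # BertConfig
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")  # BertTokenizerFast
print(type(config).__name__, type(tokenizer).__name__)
```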

@ -29,7 +29,7 @@ You can find all the original Aya Vision checkpoints under the [Aya Vision](http

> [!TIP]
> This model was contributed by [saurabhdash](https://huggingface.co/saurabhdash) and [yonigozlan](https://huggingface.co/yonigozlan).
>
> Click on the Aya Vision models in the right sidebar for more examples of how to apply Aya Vision to different image-to-text tasks.

The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
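As a minimal sketch with [`Pipeline`] (the `CohereLabs/aya-vision-8b` checkpoint name and the chat-style message format are assumptions; adapt them to the checkpoint you pick from the sidebar):

```py
# Hedged sketch: checkpoint name and message schema are assumed.
from transformers import pipeline

pipe = pipeline(
    task="image-text-to-text",
    model="CohereLabs/aya-vision-8b",
    device_map="auto",
)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
print(pipe(text=messages, max_new_tokens=64))
```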

@ -76,7 +76,7 @@ Note that 🤗 Optimum must be installed before using this feature. [Here's how

Flash Attention 2 is an even faster, optimized version of the previous optimization.

##### Installation

First, check whether your hardware is compatible with Flash Attention 2. The latest list of compatible hardware can be found in the [official documentation](https://github.com/Dao-AILab/flash-attention#installation-and-features). If your hardware is not compatible with Flash Attention 2, you can still benefit from attention kernel optimisations through Better Transformer support covered [above](https://huggingface.co/docs/transformers/main/en/model_doc/bark#using-better-transformer).

@ -86,7 +86,6 @@ Next, [install](https://github.com/Dao-AILab/flash-attention#installation-and-fe

pip install -U flash-attn --no-build-isolation
```

##### Usage

To load a model using Flash Attention 2, we can pass the `attn_implementation="flash_attention_2"` flag to [`.from_pretrained`](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained). We'll also load the model in half-precision (e.g. `torch.float16`), since it results in almost no degradation to audio quality but significantly lower memory usage and faster inference:
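A sketch of the complete call described above (assumes `flash-attn` is installed and a supported GPU is available):

```python
# Sketch: half precision + Flash Attention 2, as described in the surrounding text.
import torch
from transformers import BarkModel

model = BarkModel.from_pretrained(
    "suno/bark-small",
    dtype=torch.float16,
    attn_implementation="flash_attention_2",
).to("cuda")
```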

@ -97,7 +96,6 @@ model = BarkModel.from_pretrained("suno/bark-small", dtype=torch.float16, attn_i

##### Performance comparison

The following diagram shows the latency for the native attention implementation (no optimisation) against Better Transformer and Flash Attention 2. In all cases, we generate 400 semantic tokens on a 40GB A100 GPU with PyTorch 2.1. Flash Attention 2 is also consistently faster than Better Transformer, and its performance improves even more as batch sizes increase:

<div style="text-align: center">

@ -108,7 +106,6 @@ To put this into perspective, on an NVIDIA A100 and when generating 400 semantic

At batch size 8, on an NVIDIA A100, Flash Attention 2 is also 10% faster than Better Transformer, and at batch size 16, 25%.

#### Combining optimization techniques

You can combine optimization techniques, and use CPU offload, half-precision and Flash Attention 2 (or 🤗 Better Transformer) all at once.
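For instance, a sketch combining half precision, Flash Attention 2, and Bark's CPU offload (CPU offload requires Accelerate):

```python
# Sketch: the three optimizations discussed in this section, applied together.
import torch
from transformers import BarkModel

model = BarkModel.from_pretrained(
    "suno/bark-small",
    dtype=torch.float16,
    attn_implementation="flash_attention_2",
)
model.enable_cpu_offload()  # submodels are moved to the GPU only when needed
```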

@ -147,7 +144,7 @@ These presets are also uploaded in the hub [here](https://huggingface.co/suno/ba

>>> audio_array = audio_array.cpu().numpy().squeeze()
```

Bark can generate highly realistic, **multilingual** speech as well as other audio - including music, background noise and simple sound effects.

```python
>>> # Multilingual speech - simplified Chinese

@ -165,7 +162,6 @@ Bark can generate highly realistic, **multilingual** speech as well as other aud

The model can also produce **nonverbal communications** like laughing, sighing and crying.

```python
>>> # Adding non-speech cues to the input text
>>> inputs = processor("Hello uh ... [clears throat], my dog is cute [laughter]")

@ -235,4 +231,3 @@ To save the audio, simply take the sample rate from the model config and some sc
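A hedged sketch of that saving step, assuming the scipy WAV writer and the `sample_rate` field on the generation config:

```python
>>> # assumption: scipy is installed and `audio_array` comes from model.generate(...)
>>> from scipy.io.wavfile import write as write_wav

>>> sample_rate = model.generation_config.sample_rate
>>> write_wav("bark_generation.wav", sample_rate, audio_array)
```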

[[autodoc]] BarkSemanticConfig
- all

@ -15,7 +15,6 @@ rendered properly in your Markdown viewer.

-->
*This model was released on 2019-10-29 and added to Hugging Face Transformers on 2020-11-16.*

<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">

@ -24,7 +23,7 @@ rendered properly in your Markdown viewer.

</div>

# BART

[BART](https://huggingface.co/papers/1910.13461) is a sequence-to-sequence model that combines the pretraining objectives from BERT and GPT. It's pretrained by corrupting text in different ways like deleting words, shuffling sentences, or masking tokens and learning how to fix it. The encoder encodes the corrupted document and the corrupted text is fixed by the decoder. As it learns to recover the original text, BART gets really good at both understanding and generating language.

You can find all the original BART checkpoints under the [AI at Meta](https://huggingface.co/facebook?search_models=bart) organization.

@ -46,6 +45,7 @@ pipeline = pipeline(

pipeline("Plants create <mask> through a process known as photosynthesis.")
```

</hfoption>
<hfoption id="AutoModel">

@ -89,7 +89,7 @@ echo -e "Plants create <mask> through a process known as photosynthesis." | tran

- Inputs should be padded on the right because BART uses absolute position embeddings.
- The [facebook/bart-large-cnn](https://huggingface.co/facebook/bart-large-cnn) checkpoint doesn't include `mask_token_id` which means it can't perform mask-filling tasks.
- BART doesn't use `token_type_ids` for sequence classification. Use [`BartTokenizer`] or [`~PreTrainedTokenizerBase.encode`] to get the proper splitting.
- The forward pass of [`BartModel`] creates the `decoder_input_ids` if they're not passed. This can be different from other model APIs, but it is a useful feature for mask-filling tasks.
- Model predictions are intended to be identical to the original implementation when `forced_bos_token_id=0`. This only works if the text passed to `fairseq.encode` begins with a space.
- [`~GenerationMixin.generate`] should be used for conditional generation tasks like summarization, as in the sketch below.

@ -31,7 +31,6 @@ You can find all of the original BARThez checkpoints under the [BARThez](https:/

> This model was contributed by [moussakam](https://huggingface.co/moussakam).
> Refer to the [BART](./bart) docs for more usage examples.

The example below demonstrates how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.

<hfoptions id="usage">

@ -33,12 +33,9 @@ You can find all the original checkpoints under the [VinAI](https://huggingface.

The example below demonstrates how to summarize text with [`Pipeline`] or the [`AutoModel`] class.

<hfoptions id="usage">
<hfoption id="Pipeline">

```python
import torch
from transformers import pipeline

@ -98,8 +95,6 @@ transformers run --task summarization --model vinai/bartpho-word --device 0

</hfoption>
</hfoptions>

## Notes

- BARTpho uses the large architecture of BART with an additional layer-normalization layer on top of the encoder and decoder. The BART-specific classes should be replaced with the mBART-specific classes (see the sketch below).
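A minimal sketch of that substitution (the `vinai/bartpho-syllable` checkpoint is one of the available BARTpho checkpoints, used here for illustration; [`AutoModel`] is expected to resolve to the mBART classes):

```python
# Hedged sketch: checkpoint choice is illustrative; Auto classes resolve to mBART for BARTpho.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable")
model = AutoModel.from_pretrained("vinai/bartpho-syllable")

inputs = tokenizer("Chúng tôi là những nghiên cứu viên.", return_tensors="pt")
features = model(**inputs).last_hidden_state
```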

@ -87,7 +87,7 @@ page for more information.

SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.

```py
import torch
from transformers import BeitForImageClassification

model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224", attn_implementation="sdpa", dtype=torch.float16)
...

@ -123,6 +123,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h

- See also: [Image classification task guide](../tasks/image_classification)

**Semantic segmentation**

- [Semantic segmentation task guide](../tasks/semantic_segmentation)

If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.

@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License.

rendered properly in your Markdown viewer.

-->
*This model was released on 2019-07-29 and added to Hugging Face Transformers on 2020-11-16.*

<div style="float: right;">
<div class="flex flex-wrap space-x-1">

@ -155,4 +156,4 @@ print(tokenizer.decode(outputs[0]))

## BertGenerationDecoder

[[autodoc]] BertGenerationDecoder
- forward

@ -81,7 +81,6 @@ API reference information.

</Tip>

## BertJapaneseTokenizer

[[autodoc]] BertJapaneseTokenizer

@ -24,8 +24,7 @@ rendered properly in your Markdown viewer.

## BERTweet

[BERTweet](https://huggingface.co/papers/2005.10200) shares the same architecture as [BERT-base](./bert), but it's pretrained like [RoBERTa](./roberta) on English Tweets. It performs really well on Tweet-related tasks like part-of-speech tagging, named entity recognition, and text classification.

You can find all the original BERTweet checkpoints under the [VinAI Research](https://huggingface.co/vinai?search_models=BERTweet) organization.

@ -49,6 +48,7 @@ pipeline = pipeline(

)
pipeline("Plants create <mask> through a process known as photosynthesis.")
```

</hfoption>
<hfoption id="AutoModel">

@ -88,7 +88,8 @@ echo -e "Plants create <mask> through a process known as photosynthesis." | tran

</hfoptions>

## Notes

- Use the [`AutoTokenizer`] or [`BertweetTokenizer`] because it's preloaded with a custom vocabulary adapted to tweet-specific tokens like hashtags (#), mentions (@), emojis, and common abbreviations. Make sure to also install the [emoji](https://pypi.org/project/emoji/) library (see the sketch after this list).
- Inputs should be padded on the right (`padding="max_length"`) because BERT uses absolute position embeddings.
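A small sketch of that tokenizer setup (the `normalization` flag for tweet normalization is an assumption to verify; it requires the `emoji` package):

```py
# Hedged sketch: tweet normalization flag assumed; padding on the right as noted above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)

enc = tokenizer(
    "SC has first two presumptive cases of coronavirus 😷",
    padding="max_length",
    max_length=64,
)
print(tokenizer.convert_ids_to_tokens(enc["input_ids"])[:8])
```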

## BertweetTokenizer

@ -47,6 +47,7 @@ pipeline = pipeline(

)
pipeline("Plants create [MASK] through a process known as photosynthesis.")
```

</hfoption>
<hfoption id="AutoModel">

@ -81,10 +82,12 @@ print(f"The predicted token is: {predicted_token}")

```bash
!echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google/bigbird-roberta-base --device 0
```

</hfoption>
</hfoptions>

## Notes

- Inputs should be padded on the right because BigBird uses absolute position embeddings.
- BigBird supports `original_full` and `block_sparse` attention. If the input sequence length is less than 1024, it is recommended to use `original_full` since sparse patterns don't offer much benefit for smaller inputs (see the sketch after this list).
- The current implementation uses window size of 3 blocks and 2 global blocks, only supports the ITC-implementation, and doesn't support `num_random_blocks=0`.
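A short sketch of switching between the two attention implementations (the `attention_type` keyword is forwarded to the model config):

```py
# Sketch: sparse vs. full attention, per the notes above.
from transformers import BigBirdModel

# block-sparse attention (default) for long inputs
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base", attention_type="block_sparse")

# full attention, recommended for sequences shorter than 1024 tokens
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base", attention_type="original_full")
```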

@ -52,6 +52,7 @@ Through photosynthesis, plants capture energy from sunlight using a green pigmen

These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""")
```

</hfoption>
<hfoption id="AutoModel">

@ -77,6 +78,7 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

</hfoption>
<hfoption id="transformers-cli">
@ -135,31 +135,26 @@ print(output)

[[autodoc]] BioGptConfig

## BioGptTokenizer

[[autodoc]] BioGptTokenizer
- save_vocabulary

## BioGptModel

[[autodoc]] BioGptModel
- forward

## BioGptForCausalLM

[[autodoc]] BioGptForCausalLM
- forward

## BioGptForTokenClassification

[[autodoc]] BioGptForTokenClassification
- forward

## BioGptForSequenceClassification

[[autodoc]] BioGptForSequenceClassification
@ -36,6 +36,7 @@ The original code can be found [here](https://github.com/google-research/big_tra

## Usage tips

- BiT models are equivalent to ResNetv2 in terms of architecture, except that: 1) all batch normalization layers are replaced by [group normalization](https://huggingface.co/papers/1803.08494), and 2) [weight standardization](https://huggingface.co/papers/1903.10520) is used for convolutional layers. The authors show that the combination of both is useful for training with large batch sizes and has a significant impact on transfer learning (see the sketch below).
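A minimal classification sketch with a BiT checkpoint (the `google/bit-50` checkpoint and the COCO test image are illustrative choices):

```py
# Sketch: image classification with BitForImageClassification.
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, BitForImageClassification

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)

processor = AutoImageProcessor.from_pretrained("google/bit-50")
model = BitForImageClassification.from_pretrained("google/bit-50")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```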

@ -72,4 +73,4 @@ If you're interested in submitting a resource to be included here, please feel f

## BitForImageClassification

[[autodoc]] BitForImageClassification
- forward
@ -35,33 +35,29 @@ Several versions of the model weights are available on Hugging Face:

* [**`microsoft/bitnet-b1.58-2B-4T-gguf`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-gguf): Contains the model weights in GGUF format, compatible with the `bitnet.cpp` library for CPU inference.

### Model Details

* **Architecture:** Transformer-based, modified with `BitLinear` layers (BitNet framework).
    * Uses Rotary Position Embeddings (RoPE).
    * Uses squared ReLU (ReLU²) activation in FFN layers.
    * Employs [`subln`](https://proceedings.mlr.press/v202/wang23u.html) normalization.
    * No bias terms in linear or normalization layers.
* **Quantization:** Native 1.58-bit weights and 8-bit activations (W1.58A8); see the sketch after this list.
    * Weights are quantized to ternary values {-1, 0, +1} using absmean quantization during the forward pass.
    * Activations are quantized to 8-bit integers using absmax quantization (per-token).
    * **Crucially, the model was *trained from scratch* with this quantization scheme, not post-training quantized.**
* **Parameters:** ~2 Billion
* **Training Tokens:** 4 Trillion
* **Context Length:** Maximum sequence length of **4096 tokens**.
    * *Recommendation:* For optimal performance on tasks requiring very long contexts (beyond the pre-training length or for specialized long-reasoning tasks), we recommend performing intermediate long-sequence adaptation/training before the final fine-tuning stage.
* **Training Stages:**
    1. **Pre-training:** Large-scale training on public text/code and synthetic math data using a two-stage learning rate and weight decay schedule.
    2. **Supervised Fine-tuning (SFT):** Fine-tuned on instruction-following and conversational datasets using sum loss aggregation and specific hyperparameter tuning.
    3. **Direct Preference Optimization (DPO):** Aligned with human preferences using preference pairs.
* **Tokenizer:** LLaMA 3 Tokenizer (vocab size: 128,256).
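As an illustration only (this is not the released `BitLinear` code), a sketch of the absmean ternary weight quantization and per-token absmax int8 activation quantization described in the list above:

```python
# Illustrative sketch of the W1.58A8 scheme; not the model's actual implementation.
import torch

def quantize_weights_absmean(w: torch.Tensor, eps: float = 1e-5):
    scale = w.abs().mean().clamp(min=eps)       # absmean scale
    w_q = (w / scale).round().clamp(-1, 1)      # ternary values {-1, 0, +1}
    return w_q, scale

def quantize_activations_absmax(x: torch.Tensor, eps: float = 1e-5):
    scale = 127.0 / x.abs().amax(dim=-1, keepdim=True).clamp(min=eps)  # per-token scale
    x_q = (x * scale).round().clamp(-128, 127)  # int8 range
    return x_q, scale

w = torch.randn(8, 8)
x = torch.randn(2, 8)
w_q, w_scale = quantize_weights_absmean(w)
x_q, x_scale = quantize_activations_absmax(x)
y = (x_q @ w_q.T) * w_scale / x_scale  # dequantized matmul approximates x @ w.T
print(y.shape)
```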

## Usage tips

**VERY IMPORTANT NOTE ON EFFICIENCY**

> Please do NOT expect performance efficiency gains (in terms of speed, latency, or energy consumption) when using this model with the standard transformers library.

@ -106,7 +102,6 @@ response = tokenizer.decode(chat_outputs[0][chat_input.shape[-1]:], skip_special

print("\nAssistant Response:", response)
```

## BitNetConfig

[[autodoc]] BitNetConfig