Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-22 02:08:58 +08:00)

Compare commits: remove_dat ... v4.57.0 (2 commits: 8ac2b916b0, 2ccc6cae21)
@@ -16,10 +16,9 @@
import argparse
import copy
import os
import random
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import glob
from typing import Any, Optional

import yaml


@@ -30,6 +29,7 @@ COMMON_ENV_VARIABLES = {
    "RUN_PIPELINE_TESTS": False,
    # will be adjusted in `CircleCIJob.to_dict`.
    "RUN_FLAKY": True,
    "DISABLE_SAFETENSORS_CONVERSION": True,
}
# Disable the use of {"s": None} as the output is way too long, making navigation on CircleCI impractical
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE": None}
@@ -82,15 +82,15 @@ class EmptyJob:
@dataclass
class CircleCIJob:
    name: str
    additional_env: Dict[str, Any] = None
    docker_image: List[Dict[str, str]] = None
    install_steps: List[str] = None
    additional_env: dict[str, Any] = None
    docker_image: list[dict[str, str]] = None
    install_steps: list[str] = None
    marker: Optional[str] = None
    parallelism: Optional[int] = 0
    pytest_num_workers: int = 8
    pytest_options: Dict[str, Any] = None
    pytest_options: dict[str, Any] = None
    resource_class: Optional[str] = "xlarge"
    tests_to_run: Optional[List[str]] = None
    tests_to_run: Optional[list[str]] = None
    num_test_files_per_worker: Optional[int] = 10
    # This should be only used for doctest job!
    command_timeout: Optional[int] = None
@@ -130,6 +130,12 @@ class CircleCIJob:

    def to_dict(self):
        env = COMMON_ENV_VARIABLES.copy()
        if self.job_name != "tests_hub":
            # fmt: off
            # not critical
            env.update({"HF_TOKEN": "".join(["h", "f", "_", "H", "o", "d", "V", "u", "M", "q", "b", "R", "m", "t", "b", "z", "F", "Q", "O", "Q", "A", "J", "G", "D", "l", "V", "Q", "r", "R", "N", "w", "D", "M", "V", "C", "s", "d"])})
            # fmt: on

        # Do not run tests decorated by @is_flaky on pull requests
        env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
        env.update(self.additional_env)

@@ -149,7 +155,7 @@ class CircleCIJob:
        # Examples special case: we need to download NLTK files in advance to avoid concurrency issues
        timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else ""
        marker_cmd = f"-m '{self.marker}'" if self.marker is not None else ""
        junit_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
        junit_flags = " -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
        joined_flaky_patterns = "|".join(FLAKY_TEST_FAILURE_PATTERNS)
        repeat_on_failure_flags = f"--reruns 5 --reruns-delay 2 --only-rerun '({joined_flaky_patterns})'"
        parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> '

@@ -180,6 +186,7 @@ class CircleCIJob:
            # During the CircleCI docker images build time, we might already (or not) download the data.
            # If it's done already, the files are inside the directory `/test_data/`.
            {"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}},
            {"run": {"name": "download and unzip hub cache", "command": 'curl -L -o huggingface-cache.tar.gz https://huggingface.co/datasets/hf-internal-testing/hf_hub_cache/resolve/main/huggingface-cache.tar.gz && apt-get install pigz && tar --use-compress-program="pigz -d -p 8" -xf huggingface-cache.tar.gz && mv -n hub/* /root/.cache/huggingface/hub/ && ls -la /root/.cache/huggingface/hub/'}},
            {"run": {
                "name": "Run tests",
                "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}

@@ -200,9 +207,9 @@ class CircleCIJob:
                fi"""
                },
            },
            {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
            {"run": {"name": "Failed tests: show reasons", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
            {"run": {"name": "Errors", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}},
            {"run": {"name": "Expand to show skipped tests", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
            {"run": {"name": "Failed tests: show reasons", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
            {"run": {"name": "Errors", "when": "always", "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}},
            {"store_test_results": {"path": "test-results"}},
            {"store_artifacts": {"path": "test-results/junit.xml"}},
            {"store_artifacts": {"path": "reports"}},

@@ -1,5 +1,6 @@
import re
import argparse
import re


def parse_pytest_output(file_path):
    skipped_tests = {}

.github/ISSUE_TEMPLATE/bug-report.yml (vendored, 1 changed line)

@@ -61,6 +61,7 @@ body:
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
- kernels: @MekkCyber @drbh
- peft: @BenjaminBossan @githubnemo

Devices/Backends:

.github/PULL_REQUEST_TEMPLATE.md (vendored, 40 changed lines)

@@ -39,20 +39,23 @@ members/contributors who may be interested in your PR.

Models:

- text models: @ArthurZucker
- vision models: @amyeroberts, @qubvel
- speech models: @eustlb
- text models: @ArthurZucker @Cyrilvallez
- vision models: @yonigozlan @molbap
- audio models: @eustlb @ebezzam @vasqu
- multimodal models: @zucchini-nlp
- graph models: @clefourrier

Library:

- flax: @gante and @Rocketknight1
- generate: @zucchini-nlp (visual-language models) or @gante (all others)
- continuous batching: @remi-or @ArthurZucker @McPatate
- pipelines: @Rocketknight1
- tensorflow: @gante and @Rocketknight1
- tokenizers: @ArthurZucker
- trainer: @zach-huggingface, @SunMarc and @qgallouedec
- chat templates: @Rocketknight1
- tokenizers: @ArthurZucker and @itazap
- trainer: @zach-huggingface @SunMarc
- attention: @vasqu @ArthurZucker @CyrilVallez
- model loading (from pretrained, etc): @CyrilVallez
- distributed: @3outeille @ArthurZucker @S1ro1
- CIs: @ydshieh

Integrations:

@@ -60,20 +63,17 @@ Integrations:

- ray/raytune: @richardliaw, @amogkam
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
- kernels: @MekkCyber @drbh
- peft: @BenjaminBossan @githubnemo

Devices/Backends:

- AMD ROCm: @ivarflakstad
- Intel XPU: @IlyasMoutawwakil
- Ascend NPU: @ivarflakstad

Documentation: @stevhliu

HF projects:

- accelerate: [different repo](https://github.com/huggingface/accelerate)
- datasets: [different repo](https://github.com/huggingface/datasets)
- diffusers: [different repo](https://github.com/huggingface/diffusers)
- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)

Maintained examples (not research project or legacy):

- Flax: @Rocketknight1
- PyTorch: See Models above and tag the person corresponding to the modality of the example.
- TensorFlow: @Rocketknight1
Research projects are not maintained and should be taken as is.

-->

.github/scripts/assign_reviewers.py (vendored, 8 changed lines)

@@ -13,14 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import github
import json
from github import Github
import os
import re
from collections import Counter
from pathlib import Path

import github
from github import Github


def pattern_to_regex(pattern):
    if pattern.startswith("/"):
        start_anchor = True

.github/scripts/codeowners_for_review_action (vendored, 130 changed lines)

@@ -7,8 +7,8 @@ docs/ @stevhliu
/docker/ @ydshieh @ArthurZucker

# More high-level globs catch cases when specific rules later don't apply
/src/transformers/models/*/processing* @molbap @yonigozlan @qubvel
/src/transformers/models/*/image_processing* @qubvel
/src/transformers/models/*/processing* @molbap @yonigozlan
/src/transformers/models/*/image_processing* @yonigozlan
/src/transformers/models/*/image_processing_*_fast* @yonigozlan

# Owners of subsections of the library

@@ -186,65 +186,65 @@ trainer_utils.py @zach-huggingface @SunMarc
/src/transformers/models/zamba/mod*_zamba* @ArthurZucker

# Vision models
/src/transformers/models/beit/mod*_beit* @amyeroberts @qubvel
|
||||
/src/transformers/models/bit/mod*_bit* @amyeroberts @qubvel
|
||||
/src/transformers/models/conditional_detr/mod*_conditional_detr* @amyeroberts @qubvel
|
||||
/src/transformers/models/convnext/mod*_convnext* @amyeroberts @qubvel
|
||||
/src/transformers/models/convnextv2/mod*_convnextv2* @amyeroberts @qubvel
|
||||
/src/transformers/models/cvt/mod*_cvt* @amyeroberts @qubvel
|
||||
/src/transformers/models/deformable_detr/mod*_deformable_detr* @amyeroberts @qubvel
|
||||
/src/transformers/models/deit/mod*_deit* @amyeroberts @qubvel
|
||||
/src/transformers/models/depth_anything/mod*_depth_anything* @amyeroberts @qubvel
|
||||
/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @amyeroberts @qubvel
|
||||
/src/transformers/models/deta/mod*_deta* @amyeroberts @qubvel
|
||||
/src/transformers/models/detr/mod*_detr* @amyeroberts @qubvel
|
||||
/src/transformers/models/dinat/mod*_dinat* @amyeroberts @qubvel
|
||||
/src/transformers/models/dinov2/mod*_dinov2* @amyeroberts @qubvel
|
||||
/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @amyeroberts @qubvel
|
||||
/src/transformers/models/dit/mod*_dit* @amyeroberts @qubvel
|
||||
/src/transformers/models/dpt/mod*_dpt* @amyeroberts @qubvel
|
||||
/src/transformers/models/efficientformer/mod*_efficientformer* @amyeroberts @qubvel
|
||||
/src/transformers/models/efficientnet/mod*_efficientnet* @amyeroberts @qubvel
|
||||
/src/transformers/models/focalnet/mod*_focalnet* @amyeroberts @qubvel
|
||||
/src/transformers/models/glpn/mod*_glpn* @amyeroberts @qubvel
|
||||
/src/transformers/models/hiera/mod*_hiera* @amyeroberts @qubvel
|
||||
/src/transformers/models/ijepa/mod*_ijepa* @amyeroberts @qubvel
|
||||
/src/transformers/models/imagegpt/mod*_imagegpt* @amyeroberts @qubvel
|
||||
/src/transformers/models/levit/mod*_levit* @amyeroberts @qubvel
|
||||
/src/transformers/models/mask2former/mod*_mask2former* @amyeroberts @qubvel
|
||||
/src/transformers/models/maskformer/mod*_maskformer* @amyeroberts @qubvel
|
||||
/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @amyeroberts @qubvel
|
||||
/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @amyeroberts @qubvel
|
||||
/src/transformers/models/mobilevit/mod*_mobilevit* @amyeroberts @qubvel
|
||||
/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @amyeroberts @qubvel
|
||||
/src/transformers/models/nat/mod*_nat* @amyeroberts @qubvel
|
||||
/src/transformers/models/poolformer/mod*_poolformer* @amyeroberts @qubvel
|
||||
/src/transformers/models/pvt/mod*_pvt* @amyeroberts @qubvel
|
||||
/src/transformers/models/pvt_v2/mod*_pvt_v2* @amyeroberts @qubvel
|
||||
/src/transformers/models/regnet/mod*_regnet* @amyeroberts @qubvel
|
||||
/src/transformers/models/resnet/mod*_resnet* @amyeroberts @qubvel
|
||||
/src/transformers/models/rt_detr/mod*_rt_detr* @amyeroberts @qubvel
|
||||
/src/transformers/models/segformer/mod*_segformer* @amyeroberts @qubvel
|
||||
/src/transformers/models/seggpt/mod*_seggpt* @amyeroberts @qubvel
|
||||
/src/transformers/models/superpoint/mod*_superpoint* @amyeroberts @qubvel
|
||||
/src/transformers/models/swiftformer/mod*_swiftformer* @amyeroberts @qubvel
|
||||
/src/transformers/models/swin/mod*_swin* @amyeroberts @qubvel
|
||||
/src/transformers/models/swinv2/mod*_swinv2* @amyeroberts @qubvel
|
||||
/src/transformers/models/swin2sr/mod*_swin2sr* @amyeroberts @qubvel
|
||||
/src/transformers/models/table_transformer/mod*_table_transformer* @amyeroberts @qubvel
|
||||
/src/transformers/models/textnet/mod*_textnet* @amyeroberts @qubvel
|
||||
/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @amyeroberts @qubvel
|
||||
/src/transformers/models/upernet/mod*_upernet* @amyeroberts @qubvel
|
||||
/src/transformers/models/van/mod*_van* @amyeroberts @qubvel
|
||||
/src/transformers/models/vit/mod*_vit* @amyeroberts @qubvel
|
||||
/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @amyeroberts @qubvel
|
||||
/src/transformers/models/vitdet/mod*_vitdet* @amyeroberts @qubvel
|
||||
/src/transformers/models/vit_mae/mod*_vit_mae* @amyeroberts @qubvel
|
||||
/src/transformers/models/vitmatte/mod*_vitmatte* @amyeroberts @qubvel
|
||||
/src/transformers/models/vit_msn/mod*_vit_msn* @amyeroberts @qubvel
|
||||
/src/transformers/models/vitpose/mod*_vitpose* @amyeroberts @qubvel
|
||||
/src/transformers/models/yolos/mod*_yolos* @amyeroberts @qubvel
|
||||
/src/transformers/models/zoedepth/mod*_zoedepth* @amyeroberts @qubvel
|
||||
/src/transformers/models/beit/mod*_beit* @yonigozlan @molbap
|
||||
/src/transformers/models/bit/mod*_bit* @yonigozlan @molbap
|
||||
/src/transformers/models/conditional_detr/mod*_conditional_detr* @yonigozlan @molbap
|
||||
/src/transformers/models/convnext/mod*_convnext* @yonigozlan @molbap
|
||||
/src/transformers/models/convnextv2/mod*_convnextv2* @yonigozlan @molbap
|
||||
/src/transformers/models/cvt/mod*_cvt* @yonigozlan @molbap
|
||||
/src/transformers/models/deformable_detr/mod*_deformable_detr* @yonigozlan @molbap
|
||||
/src/transformers/models/deit/mod*_deit* @yonigozlan @molbap
|
||||
/src/transformers/models/depth_anything/mod*_depth_anything* @yonigozlan @molbap
|
||||
/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @yonigozlan @molbap
|
||||
/src/transformers/models/deta/mod*_deta* @yonigozlan @molbap
|
||||
/src/transformers/models/detr/mod*_detr* @yonigozlan @molbap
|
||||
/src/transformers/models/dinat/mod*_dinat* @yonigozlan @molbap
|
||||
/src/transformers/models/dinov2/mod*_dinov2* @yonigozlan @molbap
|
||||
/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @yonigozlan @molbap
|
||||
/src/transformers/models/dit/mod*_dit* @yonigozlan @molbap
|
||||
/src/transformers/models/dpt/mod*_dpt* @yonigozlan @molbap
|
||||
/src/transformers/models/efficientformer/mod*_efficientformer* @yonigozlan @molbap
|
||||
/src/transformers/models/efficientnet/mod*_efficientnet* @yonigozlan @molbap
|
||||
/src/transformers/models/focalnet/mod*_focalnet* @yonigozlan @molbap
|
||||
/src/transformers/models/glpn/mod*_glpn* @yonigozlan @molbap
|
||||
/src/transformers/models/hiera/mod*_hiera* @yonigozlan @molbap
|
||||
/src/transformers/models/ijepa/mod*_ijepa* @yonigozlan @molbap
|
||||
/src/transformers/models/imagegpt/mod*_imagegpt* @yonigozlan @molbap
|
||||
/src/transformers/models/levit/mod*_levit* @yonigozlan @molbap
|
||||
/src/transformers/models/mask2former/mod*_mask2former* @yonigozlan @molbap
|
||||
/src/transformers/models/maskformer/mod*_maskformer* @yonigozlan @molbap
|
||||
/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @yonigozlan @molbap
|
||||
/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @yonigozlan @molbap
|
||||
/src/transformers/models/mobilevit/mod*_mobilevit* @yonigozlan @molbap
|
||||
/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @yonigozlan @molbap
|
||||
/src/transformers/models/nat/mod*_nat* @yonigozlan @molbap
|
||||
/src/transformers/models/poolformer/mod*_poolformer* @yonigozlan @molbap
|
||||
/src/transformers/models/pvt/mod*_pvt* @yonigozlan @molbap
|
||||
/src/transformers/models/pvt_v2/mod*_pvt_v2* @yonigozlan @molbap
|
||||
/src/transformers/models/regnet/mod*_regnet* @yonigozlan @molbap
|
||||
/src/transformers/models/resnet/mod*_resnet* @yonigozlan @molbap
|
||||
/src/transformers/models/rt_detr/mod*_rt_detr* @yonigozlan @molbap
|
||||
/src/transformers/models/segformer/mod*_segformer* @yonigozlan @molbap
|
||||
/src/transformers/models/seggpt/mod*_seggpt* @yonigozlan @molbap
|
||||
/src/transformers/models/superpoint/mod*_superpoint* @yonigozlan @molbap
|
||||
/src/transformers/models/swiftformer/mod*_swiftformer* @yonigozlan @molbap
|
||||
/src/transformers/models/swin/mod*_swin* @yonigozlan @molbap
|
||||
/src/transformers/models/swinv2/mod*_swinv2* @yonigozlan @molbap
|
||||
/src/transformers/models/swin2sr/mod*_swin2sr* @yonigozlan @molbap
|
||||
/src/transformers/models/table_transformer/mod*_table_transformer* @yonigozlan @molbap
|
||||
/src/transformers/models/textnet/mod*_textnet* @yonigozlan @molbap
|
||||
/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @yonigozlan @molbap
|
||||
/src/transformers/models/upernet/mod*_upernet* @yonigozlan @molbap
|
||||
/src/transformers/models/van/mod*_van* @yonigozlan @molbap
|
||||
/src/transformers/models/vit/mod*_vit* @yonigozlan @molbap
|
||||
/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @yonigozlan @molbap
|
||||
/src/transformers/models/vitdet/mod*_vitdet* @yonigozlan @molbap
|
||||
/src/transformers/models/vit_mae/mod*_vit_mae* @yonigozlan @molbap
|
||||
/src/transformers/models/vitmatte/mod*_vitmatte* @yonigozlan @molbap
|
||||
/src/transformers/models/vit_msn/mod*_vit_msn* @yonigozlan @molbap
|
||||
/src/transformers/models/vitpose/mod*_vitpose* @yonigozlan @molbap
|
||||
/src/transformers/models/yolos/mod*_yolos* @yonigozlan @molbap
|
||||
/src/transformers/models/zoedepth/mod*_zoedepth* @yonigozlan @molbap
|
||||
|
||||
# Audio models
|
||||
/src/transformers/models/audio_spectrogram_transformer/mod*_audio_spectrogram_transformer* @eustlb
|
||||
@ -304,7 +304,7 @@ trainer_utils.py @zach-huggingface @SunMarc
|
||||
/src/transformers/models/donut/mod*_donut* @zucchini-nlp
|
||||
/src/transformers/models/flava/mod*_flava* @zucchini-nlp
|
||||
/src/transformers/models/git/mod*_git* @zucchini-nlp
|
||||
/src/transformers/models/grounding_dino/mod*_grounding_dino* @qubvel
|
||||
/src/transformers/models/grounding_dino/mod*_grounding_dino* @yonigozlan
|
||||
/src/transformers/models/groupvit/mod*_groupvit* @zucchini-nlp
|
||||
/src/transformers/models/idefics/mod*_idefics* @zucchini-nlp
|
||||
/src/transformers/models/idefics2/mod*_idefics2* @zucchini-nlp
|
||||
@ -326,10 +326,10 @@ trainer_utils.py @zach-huggingface @SunMarc
|
||||
/src/transformers/models/mgp_str/mod*_mgp_str* @zucchini-nlp
|
||||
/src/transformers/models/mllama/mod*_mllama* @zucchini-nlp
|
||||
/src/transformers/models/nougat/mod*_nougat* @NielsRogge
|
||||
/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @qubvel @yonigozlan
|
||||
/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @yonigozlan
|
||||
/src/transformers/models/oneformer/mod*_oneformer* @zucchini-nlp
|
||||
/src/transformers/models/owlvit/mod*_owlvit* @qubvel
|
||||
/src/transformers/models/owlv2/mod*_owlv2* @qubvel
|
||||
/src/transformers/models/owlvit/mod*_owlvit* @yonigozlan
|
||||
/src/transformers/models/owlv2/mod*_owlv2* @yonigozlan
|
||||
/src/transformers/models/paligemma/mod*_paligemma* @zucchini-nlp @molbap
|
||||
/src/transformers/models/perceiver/mod*_perceiver* @zucchini-nlp
|
||||
/src/transformers/models/pix2struct/mod*_pix2struct* @zucchini-nlp
|
||||
|

.github/workflows/benchmark_v2.yml (vendored, new file, 85 lines)

@@ -0,0 +1,85 @@
name: Benchmark v2 Framework

on:
  workflow_call:
    inputs:
      runner:
        description: 'GH Actions runner group to use'
        required: true
        type: string
      container_image:
        description: 'Docker image to use'
        required: true
        type: string
      container_options:
        description: 'Container options to use'
        required: true
        type: string
      commit_sha:
        description: 'Commit SHA to benchmark'
        required: false
        type: string
        default: ''
      run_id:
        description: 'Custom run ID for organizing results (auto-generated if not provided)'
        required: false
        type: string
        default: ''
      benchmark_repo_id:
        description: 'HuggingFace Dataset to upload results to (e.g., "org/benchmark-results")'
        required: false
        type: string
        default: ''

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
  # This token is created under the bot `hf-transformers-bot`.
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}

jobs:
  benchmark-v2:
    name: Benchmark v2
    runs-on: ${{ inputs.runner }}
    if: |
      (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark')) ||
      (github.event_name == 'schedule')
    container:
      image: ${{ inputs.container_image }}
      options: ${{ inputs.container_options }}
    steps:
      - name: Get repo
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.commit_sha || github.sha }}

      - name: Install benchmark dependencies
        run: |
          python3 -m pip install -r benchmark_v2/requirements.txt

      - name: Reinstall transformers in edit mode
        run: |
          python3 -m pip uninstall -y transformers
          python3 -m pip install -e ".[torch]"

      - name: Show installed libraries and their versions
        run: |
          python3 -m pip list
          python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
          python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
          python3 -c "import torch; print(f'CUDA device count: {torch.cuda.device_count()}')" || true
          nvidia-smi || true

      - name: Run benchmark v2
        working-directory: benchmark_v2
        run: |
          echo "Running benchmarks"
          python3 run_benchmarks.py \
            --commit-id '${{ inputs.commit_sha || github.sha }}' \
            --run-id '${{ inputs.run_id }}' \
            --push-to-hub '${{ inputs.benchmark_repo_id}}' \
            --token '${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}' \
            --log-level INFO
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}

.github/workflows/benchmark_v2_a10_caller.yml (vendored, new file, 21 lines)

@@ -0,0 +1,21 @@
name: Benchmark v2 Scheduled Runner - A10 Single-GPU

on:
  schedule:
    # Run daily at 16:30 UTC
    - cron: "30 16 * * *"
  pull_request:
    types: [ opened, labeled, reopened, synchronize ]

jobs:
  benchmark-v2-default:
    name: Benchmark v2 - Default Models
    uses: ./.github/workflows/benchmark_v2.yml
    with:
      runner: aws-g5-4xlarge-cache-use1-public-80
      container_image: huggingface/transformers-pytorch-gpu
      container_options: --gpus all --privileged --ipc host --shm-size "16gb"
      commit_sha: ${{ github.sha }}
      run_id: ${{ github.run_id }}
      benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
    secrets: inherit

.github/workflows/benchmark_v2_mi325_caller.yml (vendored, new file, 21 lines)

@@ -0,0 +1,21 @@
name: Benchmark v2 Scheduled Runner - MI325 Single-GPU

on:
  schedule:
    # Run daily at 16:30 UTC
    - cron: "30 16 * * *"
  pull_request:
    types: [ opened, labeled, reopened, synchronize ]

jobs:
  benchmark-v2-default:
    name: Benchmark v2 - Default Models
    uses: ./.github/workflows/benchmark_v2.yml
    with:
      runner: amd-mi325-ci-1gpu
      container_image: huggingface/transformers-pytorch-amd-gpu
      container_options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache
      commit_sha: ${{ github.sha }}
      run_id: ${{ github.run_id }}
      benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
    secrets: inherit
3
.github/workflows/build-docker-images.yml
vendored
3
.github/workflows/build-docker-images.yml
vendored
@ -5,6 +5,7 @@ on:
|
||||
branches:
|
||||
- build_ci_docker_image*
|
||||
repository_dispatch:
|
||||
workflow_dispatch:
|
||||
workflow_call:
|
||||
inputs:
|
||||
image_postfix:
|
||||
@ -221,7 +222,7 @@ jobs:
|
||||
latest-pytorch-amd:
|
||||
name: "Latest PyTorch (AMD) [dev]"
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
group: aws-highcpu-32-priv
|
||||
steps:
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
|
14
.github/workflows/build_documentation.yml
vendored
14
.github/workflows/build_documentation.yml
vendored
@ -16,7 +16,19 @@ jobs:
|
||||
commit_sha: ${{ github.sha }}
|
||||
package: transformers
|
||||
notebook_folder: transformers_doc
|
||||
languages: ar de en es fr hi it ko pt tr zh ja te
|
||||
languages: en
|
||||
custom_container: huggingface/transformers-doc-builder
|
||||
secrets:
|
||||
token: ${{ secrets.HUGGINGFACE_PUSH }}
|
||||
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
|
||||
|
||||
build_other_lang:
|
||||
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
|
||||
with:
|
||||
commit_sha: ${{ github.sha }}
|
||||
package: transformers
|
||||
notebook_folder: transformers_doc
|
||||
languages: ar de es fr hi it ja ko pt zh
|
||||
custom_container: huggingface/transformers-doc-builder
|
||||
secrets:
|
||||
token: ${{ secrets.HUGGINGFACE_PUSH }}
|
||||
|
35
.github/workflows/model_jobs.yml
vendored
35
.github/workflows/model_jobs.yml
vendored
@ -128,28 +128,47 @@ jobs:
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
echo "machine_type=$machine_type" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Create report directory if it doesn't exist
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||
echo "dummy" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/dummy.txt
|
||||
ls -la /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
||||
run: |
|
||||
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
|
||||
ls -la
|
||||
# Extract the exit code from the output file
|
||||
EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)
|
||||
exit ${EXIT_CODE:-1}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
# This step is only to show information on Github Actions log.
|
||||
# Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: Run test
|
||||
shell: bash
|
||||
- name: Captured information
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
|
||||
cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt
|
||||
|
||||
- name: Copy test_outputs.txt
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||
|
||||
collated_reports:
|
||||
name: Collated Reports
|
||||
|
@ -14,7 +14,7 @@ permissions: {}
|
||||
jobs:
|
||||
get-pr-number:
|
||||
name: Get PR number
|
||||
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
|
||||
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
|
||||
uses: ./.github/workflows/get-pr-number.yml
|
||||
|
||||
get-pr-info:
|
||||
|
2
.github/workflows/self-comment-ci.yml
vendored
2
.github/workflows/self-comment-ci.yml
vendored
@ -29,7 +29,7 @@ jobs:
|
||||
runs-on: ubuntu-22.04
|
||||
name: Get PR number
|
||||
# For security: only allow team members to run
|
||||
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
|
||||
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
|
||||
outputs:
|
||||
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
|
||||
steps:
|
||||
|
@ -20,7 +20,7 @@ jobs:
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi325-ci
|
||||
runner_group: amd-mi325
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi325
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
@ -33,7 +33,7 @@ jobs:
|
||||
with:
|
||||
job: run_pipelines_torch_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi325-ci
|
||||
runner_group: amd-mi325
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi325
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
@ -46,7 +46,7 @@ jobs:
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi325-ci
|
||||
runner_group: amd-mi325
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi325
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
@ -59,7 +59,7 @@ jobs:
|
||||
with:
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi325-ci
|
||||
runner_group: amd-mi325
|
||||
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi325
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
|
@ -20,7 +20,7 @@ jobs:
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi355-ci
|
||||
runner_group: hfc-amd-mi355
|
||||
docker: huggingface/testing-rocm7.0-preview
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
report_repo_id: hf-transformers-bot/transformers-ci-dummy
|
||||
@ -32,7 +32,7 @@ jobs:
|
||||
with:
|
||||
job: run_pipelines_torch_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi355-ci
|
||||
runner_group: hfc-amd-mi355
|
||||
docker: huggingface/testing-rocm7.0-preview
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
report_repo_id: hf-transformers-bot/transformers-ci-dummy
|
||||
@ -44,7 +44,7 @@ jobs:
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi355-ci
|
||||
runner_group: hfc-amd-mi355
|
||||
docker: huggingface/testing-rocm7.0-preview
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
report_repo_id: hf-transformers-bot/transformers-ci-dummy
|
||||
@ -56,7 +56,7 @@ jobs:
|
||||
with:
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi355-ci
|
||||
runner_group: hfc-amd-mi355
|
||||
docker: huggingface/testing-rocm7.0-preview
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
report_repo_id: hf-transformers-bot/transformers-ci-dummy
|
||||
|

.gitignore (vendored, 1 changed line)

@@ -13,6 +13,7 @@ tests/fixtures/cached_*_text.txt
logs/
lightning_logs/
lang_code_data/
reports/

# Distribution / packaging
.Python

@@ -278,13 +278,14 @@ are working on it).<br>
useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.<br>
☐ Make sure existing tests pass.<br>
☐ If adding a new feature, also add tests for it.<br>
- If you are adding a new model, make sure you use

- If you are adding a new model, make sure you use
  `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)` to trigger the common tests.
- If you are adding new `@slow` tests, make sure they pass using
- If you are adding new `@slow` tests, make sure they pass using
  `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
- If you are adding a new tokenizer, write tests and make sure
- If you are adding a new tokenizer, write tests and make sure
  `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py` passes.
- CircleCI does not run the slow tests, but GitHub Actions does every night!<br>
- CircleCI does not run the slow tests, but GitHub Actions does every night!<br>

☐ All public methods must have informative docstrings (see
[`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py)

@@ -340,6 +341,7 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t
```

Like the slow tests, there are other environment variables available which are not enabled by default during testing:

- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.

More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py).
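
For example, enabling one of these flags for a single test module looks like the following (the exact module path is only an illustration):

```bash
RUN_CUSTOM_TOKENIZERS=yes python -m pytest -sv tests/models/bert_japanese/test_tokenization_bert_japanese.py
```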

@@ -38,7 +38,6 @@ In particular all "Please explain" questions or objectively very user-specific f

* "How to train T5 on De->En translation?"


## The GitHub Issues

Everything which hints at a bug should be opened as an [issue](https://github.com/huggingface/transformers/issues).

@@ -247,7 +246,6 @@ You are not required to read the following guidelines before opening an issue. H

Try not to use italics and bold text too much, as these often make the text more difficult to read.


12. If you are cross-referencing a specific comment in a given thread or another issue, always link to that specific comment, rather than using the issue link. If you do the latter it could be quite impossible to find which specific comment you're referring to.

    To get the link to the specific comment do not copy the url from the location bar of your browser, but instead, click the `...` icon in the upper right corner of the comment and then select "Copy Link".

@@ -257,7 +255,6 @@ You are not required to read the following guidelines before opening an issue. H
1. https://github.com/huggingface/transformers/issues/9257
2. https://github.com/huggingface/transformers/issues/9257#issuecomment-749945162


13. If you are replying to a last comment, it's totally fine to make your reply with just your comment in it. The readers can follow the information flow here.

    But if you're replying to a comment that happened some comments back, it's always a good practice to quote just the relevant lines you're replying to. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like:

@@ -48,9 +48,11 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_it.md">Italiano</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_bn.md">বাংলা</a> |
    </p>
</h4>

@@ -62,7 +64,6 @@ limitations under the License.
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
</h3>

Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
vision, audio, video, and multimodal models, for both inference and training.

@@ -110,10 +111,10 @@ git clone https://github.com/huggingface/transformers.git
cd transformers

# pip
pip install .[torch]
pip install '.[torch]'

# uv
uv pip install .[torch]
uv pip install '.[torch]'
```

## Quickstart

@@ -193,7 +194,6 @@ pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.pn
<details>
<summary>Visual question answering</summary>

<h3 align="center">
    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg"></a>
</h3>

@@ -606,4 +606,3 @@ Keywords: BentoML, Framework, Deployment, AI Applications

[LLaMA Factory](https://github.com/hiyouga/LLaMA-Factory) offers a user-friendly fine-tuning framework that incorporates PEFT. The repository includes training (fine-tuning) and inference examples for LLaMA-2, BLOOM, Falcon, Baichuan, Qwen, and other LLMs. A ChatGLM version is also available in [ChatGLM-Efficient-Tuning](https://github.com/hiyouga/ChatGLM-Efficient-Tuning).

Keywords: PEFT, fine-tuning, LLaMA-2, ChatGLM, Qwen

@@ -21,6 +21,46 @@ python run_benchmarks.py \
    --num-tokens-to-generate 200
```

### Uploading Results to HuggingFace Dataset

You can automatically upload benchmark results to a HuggingFace Dataset for tracking and analysis:

```bash
# Upload to a public dataset with auto-generated run ID
python run_benchmarks.py --upload-to-hub username/benchmark-results

# Upload with a custom run ID for easy identification
python run_benchmarks.py --upload-to-hub username/benchmark-results --run-id experiment_v1

# Upload with custom HuggingFace token (if not set in environment)
python run_benchmarks.py --upload-to-hub username/benchmark-results --token hf_your_token_here
```

**Dataset Directory Structure:**
```
dataset_name/
├── 2025-01-15/
│   ├── runs/                          # Non-scheduled runs (manual, PR, etc.)
│   │   └── 123-1245151651/            # GitHub run number and ID
│   │       └── benchmark_results/
│   │           ├── benchmark_summary_20250115_143022.json
│   │           └── model-name/
│   │               └── model-name_benchmark_20250115_143022.json
│   └── benchmark_results_abc123de/    # Scheduled runs (daily CI)
│       ├── benchmark_summary_20250115_143022.json
│       └── model-name/
│           └── model-name_benchmark_20250115_143022.json
└── 2025-01-16/
    └── ...
```

**Authentication for Uploads:**

For uploading results, you need a HuggingFace token with write permissions to the target dataset. You can provide the token in several ways (in order of precedence):

1. Command line: `--token hf_your_token_here`
2. Environment variable: `HF_TOKEN`
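
For example, an explicit `--token` takes precedence over an exported `HF_TOKEN` (all values below are placeholders):

```bash
export HF_TOKEN=hf_token_from_environment

# Uses HF_TOKEN from the environment
python run_benchmarks.py --upload-to-hub username/benchmark-results

# The --token value overrides HF_TOKEN
python run_benchmarks.py --upload-to-hub username/benchmark-results --token hf_token_from_cli
```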

### Running Specific Benchmarks

```bash
@@ -20,7 +20,6 @@ import torch
from benchmark_framework import ModelBenchmark


os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high")

@@ -4,3 +4,4 @@ gpustat>=1.0.0
torch>=2.0.0
transformers>=4.30.0
datasets>=2.10.0
huggingface_hub>=0.16.0
@ -24,6 +24,7 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
@ -160,7 +161,12 @@ def run_single_benchmark(
|
||||
return None
|
||||
|
||||
|
||||
def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
|
||||
def generate_summary_report(
|
||||
output_dir: str,
|
||||
benchmark_results: dict[str, Any],
|
||||
logger: logging.Logger,
|
||||
benchmark_run_uuid: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Generate a summary report of all benchmark runs."""
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
|
||||
@ -168,6 +174,7 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any],
|
||||
summary_data = {
|
||||
"run_metadata": {
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"benchmark_run_uuid": benchmark_run_uuid,
|
||||
"total_benchmarks": len(benchmark_results),
|
||||
"successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
|
||||
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
|
||||
@ -183,9 +190,114 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any],
|
||||
return summary_file
|
||||
|
||||
|
||||
def upload_results_to_hf_dataset(
|
||||
output_dir: str,
|
||||
summary_file: str,
|
||||
dataset_name: str,
|
||||
run_id: Optional[str] = None,
|
||||
token: Optional[str] = None,
|
||||
logger: Optional[logging.Logger] = None,
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Upload benchmark results to a HuggingFace Dataset.
|
||||
Based on upload_collated_report() from utils/collated_reports.py
|
||||
Args:
|
||||
output_dir: Local output directory containing results
|
||||
summary_file: Path to the summary file
|
||||
dataset_name: Name of the HuggingFace dataset to upload to
|
||||
run_id: Unique run identifier (if None, will generate one)
|
||||
token: HuggingFace token for authentication (if None, will use environment variables)
|
||||
logger: Logger instance
|
||||
Returns:
|
||||
The run_id used for the upload, None if upload failed
|
||||
"""
|
||||
if logger is None:
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
import os
|
||||
|
||||
from huggingface_hub import HfApi
|
||||
|
||||
api = HfApi()
|
||||
|
||||
if run_id is None:
|
||||
github_run_number = os.getenv("GITHUB_RUN_NUMBER")
|
||||
github_run_id = os.getenv("GITHUB_RUN_ID")
|
||||
if github_run_number and github_run_id:
|
||||
run_id = f"{github_run_number}-{github_run_id}"
|
||||
|
||||
date_folder = datetime.now().strftime("%Y-%m-%d")
|
||||
|
||||
github_event_name = os.getenv("GITHUB_EVENT_NAME")
|
||||
if github_event_name != "schedule":
|
||||
# Non-scheduled runs go under a runs subfolder
|
||||
repo_path = f"{date_folder}/runs/{run_id}/benchmark_results"
|
||||
else:
|
||||
# Scheduled runs go directly under the date
|
||||
repo_path = f"{date_folder}/{run_id}/benchmark_results"
|
||||
|
||||
logger.info(f"Uploading benchmark results to dataset '{dataset_name}' at path '{repo_path}'")
|
||||
|
||||
try:
|
||||
# Upload all files in the output directory
|
||||
from pathlib import Path
|
||||
|
||||
output_path = Path(output_dir)
|
||||
|
||||
for file_path in output_path.rglob("*"):
|
||||
if file_path.is_file():
|
||||
# Calculate relative path from output_dir
|
||||
relative_path = file_path.relative_to(output_path)
|
||||
path_in_repo = f"{repo_path}/{relative_path}"
|
||||
|
||||
logger.debug(f"Uploading {file_path} to {path_in_repo}")
|
||||
|
||||
api.upload_file(
|
||||
path_or_fileobj=str(file_path),
|
||||
path_in_repo=path_in_repo,
|
||||
repo_id=dataset_name,
|
||||
repo_type="dataset",
|
||||
token=token,
|
||||
commit_message=f"Upload benchmark results for run {run_id}",
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Successfully uploaded results to: https://huggingface.co/datasets/{dataset_name}/tree/main/{repo_path}"
|
||||
)
|
||||
|
||||
return run_id
|
||||
|
||||
except Exception as upload_error:
|
||||
logger.error(f"Failed to upload results: {upload_error}")
|
||||
import traceback
|
||||
|
||||
logger.debug(traceback.format_exc())
|
||||
return None
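# Illustrative usage of the helper above (all values below are placeholders):
#
#     run_id = upload_results_to_hf_dataset(
#         output_dir="benchmark_results",
#         summary_file="benchmark_results/benchmark_summary_20250115_143022.json",
#         dataset_name="username/benchmark-results",
#         run_id="experiment_v1",
#         token=os.getenv("HF_TOKEN"),
#     )
#     if run_id is None:
#         print("Upload failed; results are still available locally")
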
def main():
|
||||
"""Main entry point for the benchmarking script."""
|
||||
parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
|
||||
# Generate a unique UUID for this benchmark run
|
||||
benchmark_run_uuid = str(uuid.uuid4())[:8]
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run all benchmarks in the ./benches directory",
|
||||
epilog="""
|
||||
Examples:
|
||||
# Run all available benchmarks
|
||||
python3 run_benchmarks.py
|
||||
|
||||
# Run with specific model and upload to HuggingFace Dataset
|
||||
python3 run_benchmarks.py --model-id meta-llama/Llama-2-7b-hf --upload-to-hf username/benchmark-results
|
||||
|
||||
# Run with custom run ID and upload to HuggingFace Dataset
|
||||
python3 run_benchmarks.py --run-id experiment_v1 --upload-to-hf org/benchmarks
|
||||
|
||||
# Run only specific benchmarks with file logging
|
||||
python3 run_benchmarks.py --include llama --enable-file-logging
|
||||
""", # noqa: W293
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
@ -228,20 +340,35 @@ def main():
|
||||
|
||||
parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
|
||||
|
||||
parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")
|
||||
|
||||
parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")
|
||||
|
||||
parser.add_argument(
|
||||
"--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--push-to-hub",
|
||||
type=str,
|
||||
help="Upload results to HuggingFace Dataset (provide dataset name, e.g., 'username/benchmark-results')",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--run-id", type=str, help="Custom run ID for organizing results (if not provided, will generate a unique ID)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--token",
|
||||
type=str,
|
||||
help="HuggingFace token for dataset uploads (if not provided, will use HF_TOKEN environment variable)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Setup logging
|
||||
logger = setup_logging(args.log_level, args.enable_file_logging)
|
||||
|
||||
logger.info("Starting benchmark discovery and execution")
|
||||
logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
|
||||
logger.info(f"Output directory: {args.output_dir}")
|
||||
logger.info(f"Benches directory: {args.benches_dir}")
|
||||
|
||||
@ -286,9 +413,6 @@ def main():
|
||||
if args.model_id:
|
||||
benchmark_kwargs["model_id"] = args.model_id
|
||||
|
||||
# Add enable_mock flag for mock benchmark
|
||||
benchmark_kwargs["enable_mock"] = args.enable_mock
|
||||
|
||||
# Add commit_id if provided
|
||||
if args.commit_id:
|
||||
benchmark_kwargs["commit_id"] = args.commit_id
|
||||
@ -306,7 +430,28 @@ def main():
|
||||
successful_count += 1
|
||||
|
||||
# Generate summary report
|
||||
summary_file = generate_summary_report(args.output_dir, benchmark_results, logger)
|
||||
summary_file = generate_summary_report(args.output_dir, benchmark_results, logger, benchmark_run_uuid)
|
||||
|
||||
# Upload results to HuggingFace Dataset if requested
|
||||
upload_run_id = None
|
||||
if args.push_to_hub:
|
||||
logger.info("=" * 60)
|
||||
logger.info("UPLOADING TO HUGGINGFACE DATASET")
|
||||
logger.info("=" * 60)
|
||||
# Use provided run_id or fallback to benchmark run UUID
|
||||
effective_run_id = args.run_id or benchmark_run_uuid
|
||||
upload_run_id = upload_results_to_hf_dataset(
|
||||
output_dir=args.output_dir,
|
||||
summary_file=summary_file,
|
||||
dataset_name=args.push_to_hub,
|
||||
run_id=effective_run_id,
|
||||
token=args.token,
|
||||
logger=logger,
|
||||
)
|
||||
if upload_run_id:
|
||||
logger.info(f"Upload completed with run ID: {upload_run_id}")
|
||||
else:
|
||||
logger.warning("Upload failed - continuing with local results")
|
||||
|
||||
# Final summary
|
||||
total_benchmarks = len(filtered_benchmarks)
|
||||
@ -321,6 +466,16 @@ def main():
|
||||
logger.info(f"Output directory: {args.output_dir}")
|
||||
logger.info(f"Summary report: {summary_file}")
|
||||
|
||||
if args.push_to_hub:
|
||||
if upload_run_id:
|
||||
logger.info(f"HuggingFace Dataset: {args.push_to_hub}")
|
||||
logger.info(f"Run ID: {upload_run_id}")
|
||||
logger.info(
|
||||
f"View results: https://huggingface.co/datasets/{args.push_to_hub}/tree/main/{datetime.now().strftime('%Y-%m-%d')}/runs/{upload_run_id}"
|
||||
)
|
||||
else:
|
||||
logger.warning("Upload to HuggingFace Dataset failed")
|
||||
|
||||
if failed_count > 0:
|
||||
logger.warning(f"{failed_count} benchmark(s) failed. Check logs for details.")
|
||||
return 1
|
||||
|
@ -54,7 +54,6 @@ NOT_DEVICE_TESTS = {
|
||||
"test_gradient_checkpointing_backward_compatibility",
|
||||
"test_gradient_checkpointing_enable_disable",
|
||||
"test_torch_save_load",
|
||||
"test_initialization",
|
||||
"test_forward_signature",
|
||||
"test_model_get_set_embeddings",
|
||||
"test_model_main_input_name",
|
||||
@ -64,8 +63,7 @@ NOT_DEVICE_TESTS = {
|
||||
"test_load_save_without_tied_weights",
|
||||
"test_tied_weights_keys",
|
||||
"test_model_weights_reload_no_missing_tied_weights",
|
||||
"test_mismatched_shapes_have_properly_initialized_weights",
|
||||
"test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist",
|
||||
"test_can_load_ignoring_mismatched_shapes",
|
||||
"test_model_is_small",
|
||||
"test_tf_from_pt_safetensors",
|
||||
"test_flax_from_pt_safetensors",
|
||||
@ -93,6 +91,8 @@ def pytest_configure(config):
|
||||
config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
|
||||
config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
|
||||
|
||||
os.environ['DISABLE_SAFETENSORS_CONVERSION'] = 'true'
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(items):
|
||||
for item in items:
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
USER root
|
||||
ARG REF=main
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM python:3.9-slim
|
||||
FROM python:3.10-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
|
@ -38,3 +38,10 @@ RUN python3 -m pip uninstall -y kernels
|
||||
|
||||
# On ROCm, torchcodec is required to decode audio files and 0.4 or 0.6 fails
|
||||
RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
|
||||
|
||||
# Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
|
||||
RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
|
||||
cd flash-attention && \
|
||||
GPU_ARCHS="gfx942" python setup.py install
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir einops
|
||||
|
@ -1,4 +1,4 @@
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
FROM nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive
@ -9,9 +9,9 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).

ARG PYTORCH='2.6.0'
ARG PYTORCH='2.8.0'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu121'
ARG CUDA='cu126'
# Disable kernel mapping for quantization tests
ENV DISABLE_KERNEL_MAPPING=1

@ -30,31 +30,20 @@ RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio tor

RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

# needed in bnb and awq
RUN python3 -m pip install --no-cache-dir einops

# Add bitsandbytes for mixed int8 testing
RUN python3 -m pip install --no-cache-dir bitsandbytes

# Add gptqmodel for gtpq quantization testing, installed from source for pytorch==2.6.0 compatibility
RUN python3 -m pip install lm_eval
RUN git clone https://github.com/ModelCloud/GPTQModel.git && cd GPTQModel && pip install -v . --no-build-isolation

# Add optimum for gptq quantization testing
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum

# Add PEFT
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft

# Add aqlm for quantization testing
RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2
# needed in bnb and awq
RUN python3 -m pip install --no-cache-dir einops

# Add vptq for quantization testing
RUN pip install vptq
# Add bitsandbytes
RUN python3 -m pip install --no-cache-dir bitsandbytes

# Add spqr for quantization testing
# Commented for now as No matching distribution found we need to reach out to the authors
# RUN python3 -m pip install --no-cache-dir spqr_quant[gpu]
# # Add gptqmodel
# RUN python3 -m pip install --no-cache-dir gptqmodel

# Add hqq for quantization testing
RUN python3 -m pip install --no-cache-dir hqq
@ -63,25 +52,11 @@ RUN python3 -m pip install --no-cache-dir hqq
RUN python3 -m pip install --no-cache-dir gguf

# Add autoawq for quantization testing
# New release v0.2.8
RUN python3 -m pip install --no-cache-dir autoawq[kernels]

# Add quanto for quantization testing
RUN python3 -m pip install --no-cache-dir optimum-quanto

# Add eetq for quantization testing
RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submodule update --init --recursive && pip install .

# # Add flute-kernel and fast_hadamard_transform for quantization testing
# # Commented for now as they cause issues with the build
# # TODO: create a new workflow to test them
# RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
# RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git

# Add fp-quant for quantization testing
# Requires py3.11 but our CI runs on 3.9
# RUN python3 -m pip install --no-cache-dir "fp-quant>=0.1.6"

# Add compressed-tensors for quantization testing
RUN python3 -m pip install --no-cache-dir compressed-tensors

@ -89,7 +64,10 @@ RUN python3 -m pip install --no-cache-dir compressed-tensors
RUN python3 -m pip install --no-cache-dir amd-quark

# Add AutoRound for quantization testing
RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
RUN python3 -m pip install --no-cache-dir auto-round

# Add torchao for quantization testing
RUN python3 -m pip install --no-cache-dir torchao

# Add transformers in editable mode
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
@ -103,3 +81,27 @@ RUN python3 -m pip uninstall -y flash-attn
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

# Add fp-quant for quantization testing
RUN python3 -m pip install --no-cache-dir "fp-quant>=0.2.0"

# Low usage or incompatible lib, will enable later on

# # Add aqlm for quantization testing
# RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2

# # Add vptq for quantization testing
# RUN pip install vptq

# Add spqr for quantization testing
# Commented for now as No matching distribution found we need to reach out to the authors
# RUN python3 -m pip install --no-cache-dir spqr_quant[gpu]

# # Add eetq for quantization testing
# # RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submodule update --init --recursive && pip install .

# # Add flute-kernel and fast_hadamard_transform for quantization testing
# # Commented for now as they cause issues with the build
# # TODO: create a new workflow to test them
# RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
# RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git

@ -50,7 +50,7 @@ Begin translating the text!

1. Start with the `_toctree.yml` file that corresponds to your documentation chapter. This file is essential for rendering the table of contents on the website.

- If the `_toctree.yml` file doesn’t exist for your language, create one by copying the English version and removing unrelated sections.
- If the `_toctree.yml` file doesn't exist for your language, create one by copying the English version and removing unrelated sections.
- Ensure it is placed in the `docs/source/LANG-ID/` directory.

Here’s an example structure for the `_toctree.yml` file:

@ -307,6 +307,8 @@
title: Glossary
- local: philosophy
title: Philosophy
- local: models_timeline
title: Models Timeline
- local: notebooks
title: Notebooks with examples
- local: community
@ -411,6 +413,8 @@
title: Blenderbot Small
- local: model_doc/bloom
title: BLOOM
- local: model_doc/blt
title: BLT
- local: model_doc/bort
title: BORT
- local: model_doc/byt5
@ -441,6 +445,8 @@
title: DeBERTa
- local: model_doc/deberta-v2
title: DeBERTa-v2
- local: model_doc/deepseek_v2
title: DeepSeek-V2
- local: model_doc/deepseek_v3
title: DeepSeek-V3
- local: model_doc/dialogpt
@ -763,12 +769,6 @@
title: D-FINE
- local: model_doc/dab-detr
title: DAB-DETR
- local: model_doc/deepseek_v2
title: DeepSeek-V2
- local: model_doc/deepseek_vl
title: DeepseekVL
- local: model_doc/deepseek_vl_hybrid
title: DeepseekVLHybrid
- local: model_doc/deformable_detr
title: Deformable DETR
- local: model_doc/deit
@ -851,10 +851,16 @@
title: RT-DETR
- local: model_doc/rt_detr_v2
title: RT-DETRv2
- local: model_doc/sam2
title: SAM2
- local: model_doc/segformer
title: SegFormer
- local: model_doc/seggpt
title: SegGpt
- local: model_doc/sam
title: Segment Anything
- local: model_doc/sam_hq
title: Segment Anything High Quality
- local: model_doc/superglue
title: SuperGlue
- local: model_doc/superpoint
@ -933,6 +939,8 @@
title: MusicGen
- local: model_doc/musicgen_melody
title: MusicGen Melody
- local: model_doc/parakeet
title: Parakeet
- local: model_doc/pop2piano
title: Pop2Piano
- local: model_doc/seamless_m4t
@ -977,6 +985,8 @@
title: XLSR-Wav2Vec2
title: Audio models
- sections:
- local: model_doc/sam2_video
title: SAM2 Video
- local: model_doc/timesformer
title: TimeSformer
- local: model_doc/vjepa2
@ -1021,10 +1031,18 @@
title: ColQwen2
- local: model_doc/data2vec
title: Data2Vec
- local: model_doc/deepseek_vl
title: DeepseekVL
- local: model_doc/deepseek_vl_hybrid
title: DeepseekVLHybrid
- local: model_doc/deplot
title: DePlot
- local: model_doc/donut
title: Donut
- local: model_doc/edgetam
title: EdgeTAM
- local: model_doc/edgetam_video
title: EdgeTamVideo
- local: model_doc/emu3
title: Emu3
- local: model_doc/evolla
@ -1077,6 +1095,8 @@
title: LayoutLMV3
- local: model_doc/layoutxlm
title: LayoutXLM
- local: model_doc/lfm2_vl
title: LFM2-VL
- local: model_doc/lilt
title: LiLT
- local: model_doc/llama4
@ -1135,18 +1155,12 @@
title: Qwen2Audio
- local: model_doc/qwen2_vl
title: Qwen2VL
- local: model_doc/qwen3_omni_moe
title: Qwen3-Omni-MoE
- local: model_doc/qwen3_vl
title: Qwen3VL
- local: model_doc/qwen3_vl_moe
title: Qwen3VLMoe
- local: model_doc/sam2
title: SAM2
- local: model_doc/sam2_video
title: SAM2 Video
- local: model_doc/sam
title: Segment Anything
- local: model_doc/sam_hq
title: Segment Anything High Quality
- local: model_doc/shieldgemma2
title: ShieldGemma2
- local: model_doc/siglip

@ -69,7 +69,6 @@ CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):

```bash
CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
```
@ -108,7 +107,6 @@ To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
```

You can also control the order of Intel XPUs with:

```bash
@ -120,7 +118,5 @@ For more information about device enumeration and sorting on Intel XPU, please r
</hfoption>
</hfoptions>

> [!WARNING]
> Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.

@ -145,7 +145,6 @@ Arguments can also be passed directly to `@auto_docstring` for more control. Use

The `Returns` and `Examples` parts of the docstring can also be manually specified.

```python
MODEL_COMMON_CUSTOM_ARGS = r"""
common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
@ -202,7 +201,6 @@ There are some rules for documenting different types of arguments and they're li

If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding.

- New or custom arguments should be documented within an `r""" """` block after the signature if it is a function or in the `__init__` method's docstring if it is a class.

```py
@ -212,9 +210,9 @@ There are some rules for documenting different types of arguments and they're li
This can span multiple lines.
```

* Include `type` in backticks.
* Add *optional* if the argument is not required or has a default value.
* Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`.
* Include `type` in backticks.
* Add *optional* if the argument is not required or has a default value.
* Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`.

These arguments can also be passed to `@auto_docstring` as a `custom_args` argument. It is used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.

@ -62,8 +62,6 @@ Refer to the table below to compare how caching improves efficiency.
| for each step, recompute all previous `K` and `V` | for each step, only compute current `K` and `V` |
| attention cost per step is **quadratic** with sequence length | attention cost per step is **linear** with sequence length (memory grows linearly, but compute/token remains low) |

## Cache class

A basic KV cache interface takes a key and value tensor for the current token and returns the updated `K` and `V` tensors. This is internally managed by a model's `forward` method.
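
As an illustrative aside (not part of the upstream file in this diff), a minimal sketch of that interface could look like the following. It assumes [`DynamicCache`] and uses made-up tensor shapes and a single layer purely for demonstration.

```python
# Sketch only: exercising the Cache interface directly. Shapes are arbitrary.
import torch
from transformers import DynamicCache

cache = DynamicCache()
batch, num_heads, seq_len, head_dim = 1, 8, 1, 64

# Simulate the K/V projections for the current token at layer 0.
key_states = torch.randn(batch, num_heads, seq_len, head_dim)
value_states = torch.randn(batch, num_heads, seq_len, head_dim)

# `update` appends the new K/V and returns the full cached tensors for that layer.
keys, values = cache.update(key_states, value_states, layer_idx=0)
print(keys.shape)  # torch.Size([1, 8, 1, 64]) after the first token
```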

@ -138,12 +136,11 @@ The cache position tracks where to insert new tokens in the attention cache. It

Cache position is used internally for two purposes:

1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven’t been cached yet are passed to the model's `forward`.
1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven't been cached yet are passed to the model's `forward`.
2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, that pre-allocates a specific cache length.

The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots.

```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache, infer_device
@ -160,12 +157,12 @@ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=10)

```

## Legacy cache format

Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].

The legacy format is essentially the same data structure but organized differently.

- It's a tuple of tuples, where each inner tuple contains the key and value tensors for a layer.
- The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`.
- The format is less flexible and doesn't support features like quantization or offloading.
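
As an illustrative aside (not part of the upstream file in this diff), a minimal conversion sketch between the two formats, assuming the `DynamicCache.from_legacy_cache` / `to_legacy_cache` helpers available at the time of writing and dummy tensors with made-up shapes:

```python
# Sketch only: legacy tuple-of-tuples <-> Cache object round trip.
import torch
from transformers import DynamicCache

legacy_cache = tuple(
    (torch.randn(1, 8, 5, 64), torch.randn(1, 8, 5, 64))  # (key, value) per layer
    for _ in range(2)
)

cache = DynamicCache.from_legacy_cache(legacy_cache)  # legacy -> Cache
roundtrip = cache.to_legacy_cache()                   # Cache -> legacy
print(type(cache).__name__, len(roundtrip))
```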

@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

# Tool use

Chat models are commonly trained with support for "function-calling" or "tool-use". Tools are functions supplied by the user, which the model can choose to call as part of its response. For example, models could have access to a calculator tool to perform arithmetic without having to it internally.
Chat models are commonly trained with support for "function-calling" or "tool-use". Tools are functions supplied by the user, which the model can choose to call as part of its response. For example, models could have access to a calculator tool to perform arithmetic without having to perform the computation internally.

This guide will demonstrate how to define tools, how to pass them to a chat model, and how to handle the model's output when it calls a tool.

@ -29,7 +29,6 @@ the arguments, argument types, and function docstring are parsed in order to gen
Although passing Python functions is very convenient, the parser can only handle [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings)
docstrings. Refer to the examples below for how to format a tool-ready function.

```py
def get_current_temperature(location: str, unit: str):
"""
@ -103,7 +102,6 @@ Hold the call in the `tool_calls` key of an `assistant` message. This is the rec
> [!WARNING]
> Although `tool_calls` is similar to the OpenAI API, the OpenAI API uses a JSON string as its `tool_calls` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.

```py
tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
@ -131,7 +129,6 @@ The temperature in Paris, France right now is 22°C.<|im_end|>
> Although the key in the assistant message is called `tool_calls`, in most cases, models only emit a single tool call at a time. Some older models emit multiple tool calls at the same time, but this is a
> significantly more complex process, as you need to handle multiple tool responses at once and disambiguate them, often using tool call IDs. Please refer to the model card to see exactly what format a model expects for tool calls.

## JSON schemas

Another way to define tools is by passing a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
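
As an illustrative aside (not part of the upstream file in this diff), a minimal sketch of a JSON-schema tool passed via `tools=`. The checkpoint name is a placeholder assumption; any chat model whose template supports tools should behave similarly.

```python
# Sketch only: the temperature tool expressed as a JSON schema.
from transformers import AutoTokenizer

get_current_temperature_schema = {
    "type": "function",
    "function": {
        "name": "get_current_temperature",
        "description": "Get the current temperature at a location.",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City and country, e.g. Paris, France"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location", "unit"],
        },
    },
}

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")  # placeholder checkpoint
messages = [{"role": "user", "content": "What is the temperature in Paris right now?"}]
prompt = tokenizer.apply_chat_template(
    messages, tools=[get_current_temperature_schema], add_generation_prompt=True, tokenize=False
)
print(prompt)
```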

@ -43,6 +43,7 @@ chat = [

tokenizer.apply_chat_template(chat, tokenize=False)
```

```md
<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]
```
@ -62,6 +63,7 @@ chat = [

tokenizer.apply_chat_template(chat, tokenize=False)
```

```md
<|user|>\nHello, how are you?</s>\n<|assistant|>\nI'm doing great. How can I help you today?</s>\n<|user|>\nI'd like to show off how chat templating works!</s>\n
```
@ -75,9 +77,9 @@ Mistral-7B-Instruct uses `[INST]` and `[/INST]` tokens to indicate the start and

The input to `apply_chat_template` should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker, and the `content` key contains the message. The common roles are:

- `user` for messages from the user
- `assistant` for messages from the model
- `system` for directives on how the model should act (usually placed at the beginning of the chat)
- `user` for messages from the user
- `assistant` for messages from the model
- `system` for directives on how the model should act (usually placed at the beginning of the chat)

[`apply_chat_template`] takes this list and returns a formatted sequence. Set `tokenize=True` if you want to tokenize the sequence.
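
As an illustrative aside (not part of the upstream file in this diff), a minimal end-to-end sketch of the role/content structure. The checkpoint name is a placeholder assumption.

```python
# Sketch only: building a chat and formatting it with and without tokenization.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # placeholder checkpoint
chat = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Hello, how are you?"},
]

text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
token_ids = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True)
print(text)
print(token_ids[:10])  # plain list of input ids when tokenize=True
```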

@ -110,6 +112,7 @@ Pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response.
outputs = model.generate(tokenized_chat, max_new_tokens=128)
print(tokenizer.decode(outputs[0]))
```

```md
<|system|>
You are a friendly chatbot who always responds in the style of a pirate</s>
@ -121,7 +124,7 @@ Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopte

> [!WARNING]
> Some tokenizers add special `<bos>` and `<eos>` tokens. Chat templates should already include all the necessary special tokens, and adding additional special tokens is often incorrect or duplicated, hurting model performance. When you format text with `apply_chat_template(tokenize=False)`, make sure you set `add_special_tokens=False` if you tokenize later to avoid duplicating these tokens.
> This isn’t an issue if you use `apply_chat_template(tokenize=True)`, which means it's usually the safer option!
> This isn't an issue if you use `apply_chat_template(tokenize=True)`, which means it's usually the safer option!

### add_generation_prompt

@ -135,6 +138,7 @@ Let's see an example to understand what `add_generation_prompt` is actually doin
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
tokenized_chat
```

```md
<|im_start|>user
Hi there!<|im_end|>
@ -150,6 +154,7 @@ Now, let's format the same chat with `add_generation_prompt=True`:
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
tokenized_chat
```

```md
<|im_start|>user
Hi there!<|im_end|>
@ -163,7 +168,7 @@ Can I ask a question?<|im_end|>

When `add_generation_prompt=True`, `<|im_start|>assistant` is added at the end to indicate the start of an `assistant` message. This lets the model know an `assistant` response is next.

Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don’t have any special tokens before the `assistant` response. In these cases, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect.
Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don't have any special tokens before the `assistant` response. In these cases, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect.

### continue_final_message

@ -182,14 +187,13 @@ model.generate(**formatted_chat)
```

> [!WARNING]
> You shouldn’t use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error.

[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the `assistant` role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don’t support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) argument to the pipeline.
> You shouldn't use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error.

[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the `assistant` role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don't support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) argument to the pipeline.

## Model training

Training a model with a chat template is a good way to ensure the template matches the tokens the model was trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren’t helpful during training.
Training a model with a chat template is a good way to ensure the template matches the tokens the model was trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren't helpful during training.

An example of preprocessing a dataset with a chat template is shown below.

@ -212,6 +216,7 @@ dataset = Dataset.from_dict({"chat": [chat1, chat2]})
dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)})
print(dataset['formatted_chat'][0])
```

```md
<|user|>
Which is bigger, the moon or the sun?</s>

@ -18,7 +18,6 @@ rendered properly in your Markdown viewer.

Multimodal chat models accept inputs like images, audio or video, in addition to text. The `content` key in a multimodal chat history is a list containing multiple items of different types. This is unlike text-only chat models whose `content` key is a single string.

In the same way the [Tokenizer](./fast_tokenizer) class handles chat templates and tokenization for text-only models,
the [Processor](./processors) class handles preprocessing, tokenization and chat templates for multimodal models. Their [`~ProcessorMixin.apply_chat_template`] methods are almost identical.

@ -46,7 +45,7 @@ messages = [
]
```

Create an [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map=“auto”](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Setting the data type to [auto](./models#model-data-type) also helps save memory and improve speed.
Create an [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Setting the data type to [auto](./models#model-data-type) also helps save memory and improve speed.

```python
import torch
@ -57,8 +56,7 @@ out = pipe(text=messages, max_new_tokens=128)
print(out[0]['generated_text'][-1]['content'])
```

```
```text
Ahoy, me hearty! These be two feline friends, likely some tabby cats, taking a siesta on a cozy pink blanket. They're resting near remote controls, perhaps after watching some TV or just enjoying some quiet time together. Cats sure know how to find comfort and relaxation, don't they?
```

@ -69,7 +67,6 @@ Aside from the gradual descent from pirate-speak into modern American English (i
Like [text-only models](./chat_templating), use the [`~ProcessorMixin.apply_chat_template`] method to prepare the chat messages for multimodal models.
This method handles the tokenization and formatting of the chat messages, including images and other media types. The resulting inputs are passed to the model for generation.

```python
from transformers import AutoProcessor, AutoModelForImageTextToText

@ -99,8 +96,7 @@ processed_chat = processor.apply_chat_template(messages, add_generation_prompt=T
print(list(processed_chat.keys()))
```

```
```text
['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw']
```

@ -113,14 +109,13 @@ print(processor.decode(out[0]))

The decoded output contains the full conversation so far, including the user message and the placeholder tokens that contain the image information. You may need to trim the previous conversation from the output before displaying it to the user.

## Video inputs

Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs).

- The content `"type"` should be `"video"` to indicate the content is a video.
- For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you’ve already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save to files or store it in an URL.
- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you've already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save to files or store it in an URL.

> [!WARNING]
> Loading a video from `"url"` is only supported by the PyAV or Decord backends.
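
As an illustrative aside (not part of the upstream file in this diff), a minimal sketch of a video chat message following the format described above. The URL and file path are placeholders, not real assets.

```python
# Sketch only: one user turn containing a video plus a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "url": "https://example.com/clip.mp4"},  # or {"type": "video", "path": "clip.mp4"}
            {"type": "text", "text": "Describe what happens in this video."},
        ],
    }
]
```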

@ -148,6 +143,7 @@ messages = [
```

### Example: Passing decoded video objects

```python
import numpy as np

@ -167,7 +163,9 @@ messages = [
},
]
```

You can also use existing (`"load_video()"`) function to load a video, edit the video in memory and pass it in the messages.

```python

# Make sure a video backend library (pyav, decord, or torchvision) is available.
@ -200,7 +198,6 @@ Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input

The `num_frames` parameter controls how many frames to uniformly sample from the video. Each checkpoint has a maximum frame count it was pretrained with and exceeding this count can significantly lower generation quality. It's important to choose a frame count that fits both the model capacity and your hardware resources. If `num_frames` isn't specified, the entire video is loaded without any frame sampling.

```python
processed_chat = processor.apply_chat_template(
messages,
@ -265,4 +262,3 @@ print(processed_chat.keys())

</hfoption>
</hfoptions>

@ -18,7 +18,6 @@ rendered properly in your Markdown viewer.

A chat template is a [Jinja](https://jinja.palletsprojects.com/en/stable/templates/) template stored in the tokenizer's [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax.

```jinja
{%- for message in messages %}
{{- '<|' + message['role'] + '|>\n' }}
@ -108,7 +107,6 @@ We strongly recommend using `-` to ensure only the intended content is printed.

### Special variables and callables

The only constants in a template are the `messages` variable and the `add_generation_prompt` boolean. However, you have
access to **any other keyword arguments that are passed** to the [`~PreTrainedTokenizerBase.apply_chat_template`] method.
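
As an illustrative aside (not part of the upstream file in this diff), a minimal sketch of that keyword pass-through. The checkpoint name and the `enable_thinking` flag are placeholder assumptions; the flag only has an effect if the template checks for it (for example with `{%- if enable_thinking %}`).

```python
# Sketch only: any extra keyword argument becomes a template variable.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # placeholder checkpoint
messages = [{"role": "user", "content": "Hi there!"}]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # hypothetical flag exposed to the Jinja context
)
print(prompt)
```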

@ -133,7 +131,7 @@ Make the changes below to ensure compatibility across all Jinja implementations.

### Big templates

Newer models or models with features like [tool-calling](./chat_extras#tools) and [RAG](./chat_extras#retrieval-augmented-generation-rag) require larger templates that can be longer than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file corresponds exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues.
Newer models or models with features like [tool-calling](./chat_extras) and RAG require larger templates that can be longer than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file corresponds exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues.

Write the template in a separate file and extract it to the chat template.
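
As an illustrative aside (not part of the upstream file in this diff), a minimal sketch of that workflow. The file name `template.jinja`, the checkpoint, and the output directory are placeholder assumptions.

```python
# Sketch only: keep the template in its own file and attach it to the tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # placeholder checkpoint

with open("template.jinja") as f:
    tokenizer.chat_template = f.read()

# Persists the chat template alongside the tokenizer files.
tokenizer.save_pretrained("my-model-with-template")
```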

@ -190,7 +188,7 @@ The example below shows how a tool is defined in JSON schema format.

An example of handling tool definitions in a chat template is shown below. The specific tokens and layouts should be changed to match the ones the model was trained with.

```
```jinja
{%- if tools %}
{%- for tool in tools %}
{{- '<tool>' + tool['function']['name'] + '\n' }}
@ -228,7 +226,7 @@ Tool calls are generally passed in the `tool_calls` key of an `"assistant”` me

A common pattern for handling tool calls is shown below. You can use this as a starting point, but make sure you template actually matches the format the model was trained with!

```
```jinja
{%- if message['role'] == 'assistant' and 'tool_calls' in message %}
{%- for tool_call in message['tool_calls'] %}
{{- '<tool_call>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n</tool_call>' }}
@ -251,7 +249,7 @@ Tool responses are message dicts with the `tool` role. They are much simpler tha

Some templates may not even need the `name` key, in which case, you can write your template to only read the `content` key.

```
```jinja
{%- if message['role'] == 'tool' %}
{{- "<tool_result>" + message['content'] + "</tool_result>" }}
{%- endif %}

@ -48,7 +48,6 @@ transformers chat -h

The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). It uses the `transformers serve` CLI under the hood ([docs](./serving.md#serve-cli)).

## TextGenerationPipeline

[`TextGenerationPipeline`] is a high-level text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).

@ -21,9 +21,10 @@ where `port` is the port used by `transformers serve` (`8000` by default). On th
</h3>

You're now ready to set things up on the app side! In Cursor, while you can't set a new provider, you can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set your `transformers serve` endpoint, follow this order:

1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`)
3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty;
3. Add some random text to OpenAI API Key. This field won't be used, but it can't be empty;
4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`);
5. Hit "Verify".

@ -38,5 +39,3 @@ You are now ready to use your local model in Cursor! For instance, if you toggle
<h3 align="center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor_chat.png"/>
</h3>

@ -35,7 +35,7 @@ pip install deepspeed

PyTorch comes with its own CUDA toolkit, but to use DeepSpeed with PyTorch, you need to have an identical version of CUDA installed system-wide. For example, if you installed PyTorch with `cudatoolkit==10.2` in your Python environment, then you'll also need to have CUDA 10.2 installed everywhere.

The exact location can vary from system to system, but `usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly set up and added to your `PATH` environment variable, you can find the installation location with the following command.
The exact location can vary from system to system, but `/usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly set up and added to your `PATH` environment variable, you can find the installation location with the following command.

```bash
which nvcc
@ -45,7 +45,7 @@ which nvcc

You may also have more than one CUDA toolkit installed on your system.

```bash
```text
/usr/local/cuda-10.2
/usr/local/cuda-11.0
```

@ -294,7 +294,7 @@ Consider running a [benchmark](https://github.com/microsoft/DeepSpeed/issues/998

The example ZeRO-3 and ZeRO-Infinity config below sets most of the parameter values to `auto`, but you can also manually set configure these values.

```yaml
```json
{
"fp16": {
"enabled": "auto",
@ -383,7 +383,7 @@ Gradient checkpointing saves memory by only storing *some* of the intermediate a

The batch size can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets `train_micro_batch_size_per_gpu` and `train_batch_size` to the value of `world_size * per_device_train_batch_size * gradient_accumulation_steps`.

```yaml
```json
{
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto"
@ -400,7 +400,7 @@ Reduce operations are lossy, for example, when gradients are averaged across mul

Choose the communication data type by setting the `communication_data_type` parameter in the config file. For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it's downcasted to whichever half-precision data type you're training in.

```yaml
```json
{
"communication_data_type": "fp32"
}
@ -412,7 +412,7 @@ Gradient accumulation accumulates gradients over several mini-batches of data be

Gradient accumulation can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets it to the value of `gradient_accumulation_steps`.

```yaml
```json
{
"gradient_accumulation_steps": "auto"
}
@ -424,7 +424,7 @@ Gradient clipping is useful for preventing exploding gradients which can lead to

Gradient clipping can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets it to the value of `max_grad_norm`.

```yaml
```json
{
"gradient_clipping": "auto"
}
@ -439,7 +439,7 @@ Mixed precision accelerates training speed by performing some calculations in ha

Train in fp32 if a model wasn't pretrained in mixed precision because it may cause underflow or overflow errors. Disable fp16, the default, in this case.

```yaml
```json
{
"fp16": {
"enabled": false
@ -454,7 +454,7 @@ For Ampere GPUs and PyTorch 1.7+, the more efficient [tf32](https://pytorch.org/

To configure AMP-like fp16 mixed precision, set up the config as shown below with `"auto"` or your own values. [`Trainer`] automatically enables or disables fp16 based on the value of `fp16_backend`, and the rest of the config can be set by you. fp16 is enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend amp` or `--fp16_full_eval`.

```yaml
```json
{
"fp16": {
"enabled": "auto",
@ -471,7 +471,7 @@ For additional DeepSpeed fp16 training options, take a look at the [FP16 Trainin

To configure Apex-like fp16 mixed precision, set up the config as shown below with `"auto"` or your own values. [`Trainer`] automatically configures `amp` based on the values of `fp16_backend` and `fp16_opt_level`. It can also be enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend apex` or `--fp16_opt_level 01`.

```yaml
```json
{
"amp": {
"enabled": "auto",
@ -486,11 +486,11 @@ To configure Apex-like fp16 mixed precision, set up the config as shown below wi
> [!TIP]
> bf16 requires DeepSpeed 0.6.0.

bf16 has the same dynamic range as fp32, and doesn’t require loss scaling unlike fp16. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desirable because the lower precision can lead to lossy accumulation.
bf16 has the same dynamic range as fp32, and doesn't require loss scaling unlike fp16. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desirable because the lower precision can lead to lossy accumulation.

bf16 can be set up in the config file or enabled from the command line when the following arguments are passed: `--bf16` or `--bf16_full_eval`.

```yaml
```json
{
"bf16": {
"enabled": "auto"
@ -514,7 +514,7 @@ DeepSpeed offers several [optimizers](https://www.deepspeed.ai/docs/config-json/

You can set the parameters to `"auto"` or manually input your own values.

```yaml
```json
{
"optimizer": {
"type": "AdamW",
@ -530,7 +530,7 @@ You can set the parameters to `"auto"` or manually input your own values.

Use an unsupported optimizer by adding the following to the top level configuration.

```yaml
```json
{
"zero_allow_untested_optimizer": true
}
@ -538,7 +538,7 @@ Use an unsupported optimizer by adding the following to the top level configurat

From DeepSpeed 0.8.3+, if you want to use offload, you'll also need to add the following to the top level configuration because offload works best with DeepSpeed's CPU Adam optimizer.

```yaml
```json
{
"zero_force_ds_cpu_optimizer": false
}
@ -558,7 +558,7 @@ If you don't configure the scheduler in the config file, [`Trainer`] automatical

You can set the parameters to `"auto"` or manually input your own values.

```yaml
```json
{
"scheduler": {
"type": "WarmupDecayLR",
@ -581,7 +581,7 @@ You can set the parameters to `"auto"` or manually input your own values.

Resume training with a Universal checkpoint by setting `load_universal` to `true` in the config file.

```yaml
```json
{
"checkpoint": {
"load_universal": true
@ -640,7 +640,7 @@ deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \

A multi-node setup consists of multiple nodes, where each node has one of more GPUs running a workload. DeepSpeed expects a shared storage system, but if this is not the case, you need to adjust the config file to include a [checkpoint](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to allow loading without access to a shared filesystem.

```yaml
```json
{
"checkpoint": {
"use_node_local_storage": true
@ -824,7 +824,7 @@ ZeRO-2 saves the model weights in fp16. To save the weights in fp16 for ZeRO-3,

If you don't, [`Trainer`] won't save the weights in fp16 and won't create a `pytorch_model.bin` file. This is because DeepSpeed's state_dict contains a placeholder instead of the real weights, so you won't be able to load it.

```yaml
```json
{
"zero_optimization": {
"stage": 3,
@ -986,7 +986,7 @@ NaN loss often occurs when a model is pretrained in bf16 and you try to use it w

It is also possible that fp16 is causing overflow. For example, if your config file looks like the one below, you may see the following overflow errors in the logs.

```yaml
```json
{
"fp16": {
"enabled": "auto",

@ -226,7 +226,7 @@ tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir")

<Youtube id="Yffk5aydLzg"/>

A Transformers model expects the input to be a PyTorch or NumPy tensor. A tokenizers job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter.
A Transformers model expects the input to be a PyTorch or NumPy tensor. A tokenizer's job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter.

```py
from transformers import AutoTokenizer

@ -229,6 +229,7 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
## Custom generation methods

Custom generation methods enable specialized behavior such as:

- have the model continue thinking if it is uncertain;
- roll back generation if the model gets stuck;
- handle special tokens with custom logic;
@ -289,7 +290,7 @@ print(tokenizer.batch_decode(gen_out)[0])

If the custom method has pinned Python requirements that your environment doesn't meet, you'll get an exception about missing requirements. For instance, [transformers-community/custom_generate_bad_requirements](https://huggingface.co/transformers-community/custom_generate_bad_requirements) has an impossible set of requirements defined in its `custom_generate/requirements.txt` file, and you'll see the error message below if you try to run it.

```
```text
ImportError: Missing requirements in your local environment for `transformers-community/custom_generate_bad_requirements`:
foo (installed: None)
bar==0.0.0 (installed: None)
@ -301,6 +302,7 @@ Updating your Python requirements accordingly will remove this error message.
### Creating a custom generation method

To create a new generation method, you need to create a new [**Model**](https://huggingface.co/new) repository and push a few files into it.

1. The model you've designed your generation method with.
2. `custom_generate/generate.py`, which contains all the logic for your custom generation method.
3. `custom_generate/requirements.txt`, used to optionally add new Python requirements and/or lock specific versions to correctly use your method.
@ -308,7 +310,7 @@ To create a new generation method, you need to create a new [**Model**](https://

After you've added all required files, your repository should look like this

```
```text
your_repo/
├── README.md # include the 'custom_generate' tag
├── config.json
@ -377,6 +379,7 @@ def generate(model, input_ids, generation_config=None, left_padding=None, **kwar
```

Follow the recommended practices below to ensure your custom generation method works as expected.

- Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
- Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
- Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment.
@ -389,7 +392,6 @@ from .utils import some_function

Only relative imports from the same-level `custom_generate` folder are supported. Parent/sibling folder imports are not valid. The `custom_generate` argument also works locally with any directory that contains a `custom_generate` structure. This is the recommended workflow for developing your custom generation method.

#### requirements.txt

You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.
@ -400,7 +402,7 @@ The root level `README.md` in the model repository usually describes the model t

For discoverability, we highly recommend you to add the `custom_generate` tag to your repository. To do so, the top of your `README.md` file should look like the example below. After you push the file, you should see the tag in your repository!

```
```text
---
library_name: transformers
tags:
@ -411,13 +413,14 @@ tags:
```

Recommended practices:

- Document input and output differences in [`~GenerationMixin.generate`].
- Add self-contained examples to enable quick experimentation.
- Describe soft-requirements such as if the method only works well with a certain family of models.

### Reusing `generate`’s input preparation
### Reusing `generate`'s input preparation

If you're adding a new decoding loop, you might want to preserve the input preparation present in `generate` (batch expansion, attention masks, logits processors, stopping criteria, etc.). You can also pass a **callable** to `custom_generate` to reuse [`~GenerationMixin.generate`]’s full preparation pipeline while overriding only the decoding loop.
If you're adding a new decoding loop, you might want to preserve the input preparation present in `generate` (batch expansion, attention masks, logits processors, stopping criteria, etc.). You can also pass a **callable** to `custom_generate` to reuse [`~GenerationMixin.generate`]'s full preparation pipeline while overriding only the decoding loop.

```py
def custom_loop(model, input_ids, attention_mask, logits_processor, stopping_criteria, generation_config, **model_kwargs):
@ -438,11 +441,12 @@ output = model.generate(
```

> [!TIP]
> If you publish a `custom_generate` repository, your `generate` implementation can itself define a callable and pass it to `model.generate()`. This lets you customize the decoding loop while still benefiting from Transformers’ built-in input preparation logic.
> If you publish a `custom_generate` repository, your `generate` implementation can itself define a callable and pass it to `model.generate()`. This lets you customize the decoding loop while still benefiting from Transformers' built-in input preparation logic.

### Finding custom generation methods

You can find all custom generation methods by [searching for their custom tag.](https://huggingface.co/models?other=custom_generate), `custom_generate`. In addition to the tag, we curate two collections of `custom_generate` methods:

- [Custom generation methods - Community](https://huggingface.co/collections/transformers-community/custom-generation-methods-community-6888fb1da0efbc592d3a8ab6) -- a collection of powerful methods contributed by the community;
- [Custom generation methods - Tutorials](https://huggingface.co/collections/transformers-community/custom-generation-methods-tutorials-6823589657a94940ea02cfec) -- a collection of reference implementations for methods that previously were part of `transformers`, as well as tutorials for `custom_generate`.

@ -185,9 +185,9 @@ See the [Fine-tune a pretrained model](https://huggingface.co/docs/transformers/

The model head refers to the last layer of a neural network that accepts the raw hidden states and projects them onto a different dimension. There is a different model head for each task. For example:

* [`GPT2ForSequenceClassification`] is a sequence classification head - a linear layer - on top of the base [`GPT2Model`].
* [`ViTForImageClassification`] is an image classification head - a linear layer on top of the final hidden state of the `CLS` token - on top of the base [`ViTModel`].
* [`Wav2Vec2ForCTC`] is a language modeling head with [CTC](#connectionist-temporal-classification-ctc) on top of the base [`Wav2Vec2Model`].
* [`GPT2ForSequenceClassification`] is a sequence classification head - a linear layer - on top of the base [`GPT2Model`].
* [`ViTForImageClassification`] is an image classification head - a linear layer on top of the final hidden state of the `CLS` token - on top of the base [`ViTModel`].
* [`Wav2Vec2ForCTC`] is a language modeling head with [CTC](#connectionist-temporal-classification-ctc) on top of the base [`Wav2Vec2Model`].

## I

@ -19,7 +19,6 @@ rendered properly in your Markdown viewer.
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
</h3>

Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
vision, audio, video, and multimodal model, for both inference and training.

@ -35,6 +34,10 @@ There are over 1M+ Transformers [model checkpoints](https://huggingface.co/model

Explore the [Hub](https://huggingface.com/) today to find a model and use Transformers to help you get started right away.

Explore the [Models Timeline](./models_timeline) to discover the latest text, vision, audio and multimodal model architectures in Transformers.

## Features

Transformers provides everything you need for inference or training with state-of-the-art pretrained models. Some of the main features include:

@ -20,7 +20,6 @@ This page lists all of Transformers general utility functions that are found in

Most of those are only useful if you are studying the general code in the library.

## Enums and namedtuples

[[autodoc]] utils.ExplicitEnum

@ -65,7 +65,6 @@ values. Here, for instance, it has two keys that are `sequences` and `scores`.

We document here all output types.

[[autodoc]] generation.GenerateDecoderOnlyOutput

[[autodoc]] generation.GenerateEncoderDecoderOutput
@ -74,13 +73,11 @@ We document here all output types.

[[autodoc]] generation.GenerateBeamEncoderDecoderOutput

## LogitsProcessor

A [`LogitsProcessor`] can be used to modify the prediction scores of a language model head for
generation.

[[autodoc]] AlternatingCodebooksLogitsProcessor
- __call__

@ -174,8 +171,6 @@ generation.
[[autodoc]] WatermarkLogitsProcessor
- __call__

## StoppingCriteria

A [`StoppingCriteria`] can be used to change when to stop generation (other than EOS token). Please note that this is exclusively available to our PyTorch implementations.
@ -300,7 +295,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens
- to_legacy_cache
- from_legacy_cache

## Watermark Utils

[[autodoc]] WatermarkingConfig

@ -21,10 +21,8 @@ provides for it.
|
||||
|
||||
Most of those are only useful if you are adding new models in the library.
|
||||
|
||||
|
||||
## Model addition debuggers
|
||||
|
||||
|
||||
### Model addition debugger - context manager for model adders
|
||||
|
||||
This context manager is a power user tool intended for model adders. It tracks all forward calls within a model forward
|
||||
@ -72,7 +70,6 @@ with model_addition_debugger_context(
|
||||
|
||||
```
|
||||
|
||||
|
||||
### Reading results
|
||||
|
||||
The debugger generates two files from the forward call, both with the same base name, but ending either with
|
||||
@ -221,9 +218,9 @@ path reference to the associated `.safetensors` file. Each tensor is written to
|
||||
the state dictionary. File names are constructed using the `module_path` as a prefix with a few possible postfixes that
|
||||
are built recursively.
|
||||
|
||||
* Module inputs are denoted with the `_inputs` and outputs by `_outputs`.
|
||||
* `list` and `tuple` instances, such as `args` or function return values, will be postfixed with `_{index}`.
|
||||
* `dict` instances will be postfixed with `_{key}`.
|
||||
* Module inputs are denoted with the `_inputs` and outputs by `_outputs`.
|
||||
* `list` and `tuple` instances, such as `args` or function return values, will be postfixed with `_{index}`.
|
||||
* `dict` instances will be postfixed with `_{key}`.
|
||||
|
||||
### Comparing between implementations
|
||||
|
||||
@ -231,10 +228,8 @@ Once the forward passes of two models have been traced by the debugger, one can
|
||||
below: we can see slight differences between these two implementations' key projection layer. Inputs are mostly
|
||||
identical, but not quite. Looking through the file differences makes it easier to pinpoint which layer is wrong.
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
### Limitations and scope
|
||||
|
||||
This feature will only work for torch-based models, and would require more work and case-by-case approach for say
|
||||
@ -261,6 +256,7 @@ how many tests are being skipped and for which models.
|
||||
When porting models to transformers, tests fail as they should, and sometimes `test_modeling_common` feels irreconcilable with the peculiarities of our brand new model. But how can we be sure we're not breaking everything by adding a seemingly innocent skip?
|
||||
|
||||
This utility:
|
||||
|
||||
- scans all test_modeling_common methods
|
||||
- looks for times where a method is skipped
|
||||
- returns a summary JSON that you can load as a DataFrame or inspect directly
|
||||
@ -269,7 +265,6 @@ This utility:
|
||||
|
||||

|
||||
|
||||
|
||||
### Usage
|
||||
|
||||
You can run the skipped test analyzer in two ways:
|
||||
@ -286,7 +281,7 @@ python utils/scan_skipped_tests.py --output_dir path/to/output
|
||||
|
||||
**Example output:**
|
||||
|
||||
```
|
||||
```text
|
||||
🔬 Parsing 331 model test files once each...
|
||||
📝 Aggregating 224 tests...
|
||||
(224/224) test_update_candidate_strategy_with_matches_1es_3d_is_nonecodet_schedule_fa_kwargs
|
||||
|
@ -20,7 +20,6 @@ This page lists all the utility functions the library provides for pipelines.
|
||||
|
||||
Most of those are only useful if you are studying the code of the models in the library.
|
||||
|
||||
|
||||
## Argument handling
|
||||
|
||||
[[autodoc]] pipelines.ArgumentHandler
|
||||
|
@ -25,7 +25,7 @@ You are now ready to chat!
|
||||
|
||||
To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following into your Jan machine's terminal
|
||||
|
||||
```
|
||||
```bash
|
||||
ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server
|
||||
```
|
||||
|
||||
|
@ -213,7 +213,7 @@ A cache can also work in iterative generation settings where there is back-and-f
|
||||
|
||||
For iterative generation with a cache, start by initializing an empty cache class and then you can feed in your new prompts. Keep track of dialogue history with a [chat template](./chat_templating).
|
||||
|
||||
The following example demonstrates [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). If you’re using a different chat-style model, [`~PreTrainedTokenizer.apply_chat_template`] may process messages differently. It might cut out important tokens depending on how the Jinja template is written.
|
||||
The following example demonstrates [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). If you're using a different chat-style model, [`~PreTrainedTokenizer.apply_chat_template`] may process messages differently. It might cut out important tokens depending on how the Jinja template is written.
|
||||
|
||||
For example, some models use special `<think> ... </think>` tokens during reasoning. These could get lost during re-encoding, causing indexing issues. You might need to manually remove or adjust extra tokens from the completions to keep things stable.
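A minimal sketch of the loop described above, assuming the gated [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) checkpoint (any chat model with a chat template works the same way):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")

past_key_values = DynamicCache()  # empty cache, reused and grown across turns
messages = []
for user_turn in ["Hello, what's your name?", "Can you summarize what you just said?"]:
    messages.append({"role": "user", "content": user_turn})
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(inputs, past_key_values=past_key_values, max_new_tokens=64)
    reply = tokenizer.decode(outputs[0, inputs.shape[1]:], skip_special_tokens=True)
    messages.append({"role": "assistant", "content": reply})
```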
|
||||
|
||||
|
@ -35,6 +35,7 @@ Before you begin, it's helpful to install [bitsandbytes](https://hf.co/docs/bits
|
||||
```bash
|
||||
!pip install -U transformers bitsandbytes
|
||||
```
|
||||
|
||||
Bitsandbytes supports multiple backends in addition to CUDA-based GPUs. Refer to the multi-backend installation [guide](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend) to learn more.
|
||||
|
||||
Load an LLM with [`~PreTrainedModel.from_pretrained`] and add the following two parameters to reduce the memory requirements.
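One common way to do this (a sketch, not necessarily the exact snippet used in this guide) combines `device_map="auto"` with a bitsandbytes quantization config; Mistral-7B is used purely as an example checkpoint:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    device_map="auto",                                          # place layers on the available device(s)
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # quantize weights to 4-bit on load
)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
```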
|
||||
@ -92,6 +93,7 @@ model.generate(**inputs, num_beams=4, do_sample=True)
|
||||
```
|
||||
|
||||
[`~GenerationMixin.generate`] can also be extended with external libraries or custom code:
|
||||
|
||||
1. the `logits_processor` parameter accepts custom [`LogitsProcessor`] instances for manipulating the next token probability distribution;
|
||||
2. the `stopping_criteria` parameters supports custom [`StoppingCriteria`] to stop text generation;
|
||||
3. other custom generation methods can be loaded through the `custom_generate` flag ([docs](generation_strategies.md/#custom-decoding-methods)).
|
||||
@ -154,7 +156,6 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
||||
| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
|
||||
| `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
|
||||
|
||||
|
||||
## Pitfalls
|
||||
|
||||
The section below covers some common issues you may encounter during text generation and how to solve them.
|
||||
|
@ -66,6 +66,7 @@ If you have access to an 8 x 80GB A100 node, you could load BLOOM as follows
|
||||
```bash
|
||||
!pip install transformers accelerate bitsandbytes optimum
|
||||
```
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
@ -98,7 +99,8 @@ result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single
|
||||
```
|
||||
|
||||
@ -116,7 +118,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```bash
|
||||
|
||||
```text
|
||||
29.0260648727417
|
||||
```
|
||||
|
||||
@ -127,7 +130,6 @@ Note that if we had tried to run the model in full float32 precision, a whopping
|
||||
|
||||
If you are unsure in which format the model weights are stored on the Hub, you can always look into the checkpoint's config under `"dtype"`, *e.g.* [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to set the model to the same precision type as written in the config when loading with `from_pretrained(..., dtype=...)`, except when the original type is float32, in which case one can use either `float16` or `bfloat16` for inference.
|
||||
|
||||
|
||||
Let's define a `flush(...)` function to free all allocated memory so that we can accurately measure the peak allocated GPU memory.
|
||||
|
||||
```python
|
||||
@ -148,6 +150,7 @@ Let's call it now for the next experiment.
|
||||
```python
|
||||
flush()
|
||||
```
|
||||
|
||||
From the Accelerate library, you can also use a device-agnostic utility method called [release_memory](https://github.com/huggingface/accelerate/blob/29be4788629b772a3b722076e433b5b3b5c85da3/src/accelerate/utils/memory.py#L63), which takes various hardware backends like XPU, MLU, NPU, MPS, and more into account.
|
||||
|
||||
```python
|
||||
@ -204,7 +207,8 @@ result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single
|
||||
```
|
||||
|
||||
@ -215,15 +219,16 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
15.219234466552734
|
||||
```
|
||||
|
||||
Significantly less! We're down to just a bit over 15 GBs and could therefore run this model on consumer GPUs like the 4090.
|
||||
We're seeing a very nice gain in memory efficiency and more or less no degradation to the model's output. However, we can also notice a slight slow-down during inference.
|
||||
|
||||
|
||||
We delete the models and flush the memory again.
|
||||
|
||||
```python
|
||||
del model
|
||||
del pipe
|
||||
@ -245,7 +250,8 @@ result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
Here is a Python function that transforms bytes to Giga bytes:\n\n```\ndef bytes_to_gigabytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single argument
|
||||
```
|
||||
|
||||
@ -256,7 +262,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
9.543574333190918
|
||||
```
|
||||
|
||||
@ -270,6 +277,7 @@ Also note that inference here was again a bit slower compared to 8-bit quantizat
|
||||
del model
|
||||
del pipe
|
||||
```
|
||||
|
||||
```python
|
||||
flush()
|
||||
```
|
||||
@ -384,6 +392,7 @@ def alternating(list1, list2):
|
||||
-----
|
||||
"""
|
||||
```
|
||||
|
||||
For demonstration purposes, we duplicate the system prompt ten times so that the input length is long enough to observe Flash Attention's memory savings.
|
||||
We append the original text prompt `"Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here"`
|
||||
|
||||
@ -413,7 +422,8 @@ result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
Generated in 10.96854019165039 seconds.
|
||||
Sure. Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. Here is a function that does that.\n\ndef
|
||||
```
|
||||
@ -429,7 +439,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```bash
|
||||
|
||||
```text
|
||||
37.668193340301514
|
||||
```
|
||||
|
||||
@ -460,7 +471,8 @@ result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
Generated in 3.0211617946624756 seconds.
|
||||
Sure. Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. Here is a function that does that.\n\ndef
|
||||
```
|
||||
@ -474,7 +486,8 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
32.617331981658936
|
||||
```
|
||||
|
||||
@ -604,7 +617,8 @@ generated_text
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
shape of input_ids torch.Size([1, 21])
|
||||
shape of input_ids torch.Size([1, 22])
|
||||
shape of input_ids torch.Size([1, 23])
|
||||
@ -641,7 +655,8 @@ generated_text
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
shape of input_ids torch.Size([1, 1])
|
||||
length of key-value cache 20
|
||||
shape of input_ids torch.Size([1, 1])
|
||||
@ -675,7 +690,7 @@ Note that, despite our advice to use key-value caches, your LLM output may be sl
|
||||
|
||||
The key-value cache is especially useful for applications such as chat where multiple passes of auto-regressive decoding are required. Let's look at an example.
|
||||
|
||||
```
|
||||
```text
|
||||
User: How many people live in France?
|
||||
Assistant: Roughly 75 million people live in France
|
||||
User: And how many are in Germany?
|
||||
@ -712,7 +727,8 @@ tokenizer.batch_decode(generation_output.sequences)[0][len(prompt):]
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
is a modified version of the function that returns Mega bytes instead.
|
||||
|
||||
def bytes_to_megabytes(bytes):
|
||||
@ -733,7 +749,8 @@ config = model.config
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
|
||||
```text
|
||||
7864320000
|
||||
```
|
||||
|
||||
@ -773,7 +790,6 @@ The most notable application of GQA is [Llama-v2](https://huggingface.co/meta-ll
|
||||
|
||||
> As a conclusion, it is strongly recommended to make use of either GQA or MQA if the LLM is deployed with auto-regressive decoding and is required to handle large input sequences as is the case for example for chat.
|
||||
|
||||
|
||||
## Conclusion
|
||||
|
||||
The research community is constantly coming up with new, nifty ways to speed up inference time for ever-larger LLMs. As an example, one such promising research direction is [speculative decoding](https://huggingface.co/papers/2211.17192) where "easy tokens" are generated by smaller, faster language models and only "hard tokens" are generated by the LLM itself. Going into more detail is out of the scope of this notebook, but you can read more about it in this [nice blog post](https://huggingface.co/blog/assisted-generation).
|
||||
|
@ -54,7 +54,6 @@ The main class that implements callbacks is [`TrainerCallback`]. It gets the
|
||||
Trainer's internal state via [`TrainerState`], and can take some actions on the training loop via
|
||||
[`TrainerControl`].
|
||||
|
||||
|
||||
## Available Callbacks
|
||||
|
||||
Here is the list of the available [`TrainerCallback`] in the library:
|
||||
|
@ -24,7 +24,6 @@ Each derived config class implements model specific attributes. Common attribute
|
||||
`hidden_size`, `num_attention_heads`, and `num_hidden_layers`. Text models further implement:
|
||||
`vocab_size`.
|
||||
|
||||
|
||||
## PretrainedConfig
|
||||
|
||||
[[autodoc]] PretrainedConfig
|
||||
|
@ -25,7 +25,6 @@ on the formed batch.
|
||||
|
||||
Examples of use can be found in the [example scripts](../examples) or [example notebooks](../notebooks).
|
||||
|
||||
|
||||
## Default data collator
|
||||
|
||||
[[autodoc]] data.data_collator.default_data_collator
|
||||
|
@ -15,14 +15,12 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
|
||||
# ExecuTorch
|
||||
|
||||
[`ExecuTorch`](https://github.com/pytorch/executorch) is an end-to-end solution for enabling on-device inference capabilities across mobile and edge devices including wearables, embedded devices and microcontrollers. It is part of the PyTorch ecosystem and supports the deployment of PyTorch models with a focus on portability, productivity, and performance.
|
||||
|
||||
ExecuTorch introduces well defined entry points to perform model, device, and/or use-case specific optimizations such as backend delegation, user-defined compiler transformations, memory planning, and more. The first step in preparing a PyTorch model for execution on an edge device using ExecuTorch is to export the model. This is achieved through the use of a PyTorch API called [`torch.export`](https://pytorch.org/docs/stable/export.html).
|
||||
|
||||
|
||||
## ExecuTorch Integration
|
||||
|
||||
An integration point is being developed to ensure that 🤗 Transformers can be exported using `torch.export`. The goal of this integration is not only to enable export but also to ensure that the exported artifact can be further lowered and optimized to run efficiently in `ExecuTorch`, particularly for mobile and edge use cases.
|
||||
|
@ -18,7 +18,6 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
A feature extractor is in charge of preparing input features for audio or vision models. This includes feature extraction from sequences, e.g., pre-processing audio files to generate Log-Mel Spectrogram features, feature extraction from images, e.g., cropping image files, but also padding, normalization, and conversion to NumPy and PyTorch tensors.
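A minimal sketch of that workflow, assuming the `openai/whisper-tiny` checkpoint (any audio checkpoint that ships a feature extractor behaves similarly):

```python
import numpy as np
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")

# One second of silence sampled at 16 kHz stands in for a real waveform
audio = np.zeros(16_000, dtype=np.float32)
inputs = feature_extractor(audio, sampling_rate=16_000, return_tensors="pt")
print(inputs["input_features"].shape)  # log-mel features ready for the model
```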
|
||||
|
||||
|
||||
## FeatureExtractionMixin
|
||||
|
||||
[[autodoc]] feature_extraction_utils.FeatureExtractionMixin
|
||||
|
@ -26,6 +26,7 @@ from transformers import AutoImageProcessor
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)
|
||||
```
|
||||
|
||||
Note that `use_fast` will be set to `True` by default in a future release.
|
||||
|
||||
When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise.
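A minimal sketch of the `device` argument, assuming a CUDA device is available (a synthetic image keeps the example self-contained):

```python
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)

image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))
inputs = processor(images=image, device="cuda", return_tensors="pt")
print(inputs["pixel_values"].device)  # cuda:0
```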
|
||||
@ -57,7 +58,6 @@ Here are some speed comparisons between the base and fast image processors for t
|
||||
|
||||
These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU.
|
||||
|
||||
|
||||
## ImageProcessingMixin
|
||||
|
||||
[[autodoc]] image_processing_utils.ImageProcessingMixin
|
||||
@ -72,7 +72,6 @@ These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon
|
||||
|
||||
[[autodoc]] image_processing_utils.BaseImageProcessor
|
||||
|
||||
|
||||
## BaseImageProcessorFast
|
||||
|
||||
[[autodoc]] image_processing_utils_fast.BaseImageProcessorFast
|
||||
|
@ -55,7 +55,6 @@ logger.info("INFO")
|
||||
logger.warning("WARN")
|
||||
```
|
||||
|
||||
|
||||
All the methods of this logging module are documented below, the main ones are
|
||||
[`logging.get_verbosity`] to get the current level of verbosity in the logger and
|
||||
[`logging.set_verbosity`] to set the verbosity to the level of your choice. In order (from the least
|
||||
@ -81,6 +80,7 @@ We use both in the `transformers` library. We leverage and adapt `logging`'s `ca
|
||||
management of these warning messages by the verbosity setters above.
|
||||
|
||||
What does that mean for developers of the library? We should respect the following heuristics:
|
||||
|
||||
- `warnings` should be favored for developers of the library and libraries dependent on `transformers`
|
||||
- `logging` should be used for end-users of the library using it in every-day projects
|
||||
|
||||
|
@ -26,7 +26,6 @@ file or directory, or from a pretrained model configuration provided by the libr
|
||||
|
||||
The other methods that are common to each model are defined in [`~modeling_utils.ModuleUtilsMixin`] and [`~generation.GenerationMixin`].
|
||||
|
||||
|
||||
## PreTrainedModel
|
||||
|
||||
[[autodoc]] PreTrainedModel
|
||||
|
@ -51,4 +51,3 @@ to export models for different types of topologies or tasks.
|
||||
### FeaturesManager
|
||||
|
||||
[[autodoc]] onnx.features.FeaturesManager
|
||||
|
||||
|
@ -22,7 +22,6 @@ The `.optimization` module provides:
|
||||
- several schedules in the form of schedule objects that inherit from `_LRSchedule`:
|
||||
- a gradient accumulation class to accumulate the gradients of multiple batches
|
||||
|
||||
|
||||
## AdaFactor
|
||||
|
||||
[[autodoc]] Adafactor
|
||||
|
@ -47,7 +47,6 @@ However, this is not always the case. Some models apply normalization or subsequ
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
|
||||
will get `None`. Here for instance `outputs.loss` is the loss computed by the model, and `outputs.attentions` is
|
||||
`None`.
|
||||
|
@ -81,7 +81,6 @@ for out in tqdm(pipe(KeyDataset(dataset, "file"))):
|
||||
|
||||
For ease of use, a generator is also possible:
|
||||
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
@ -160,7 +159,7 @@ for batch_size in [1, 8, 64, 256]:
|
||||
pass
|
||||
```
|
||||
|
||||
```
|
||||
```text
|
||||
# On GTX 970
|
||||
------------------------------
|
||||
Streaming no batching
|
||||
@ -196,8 +195,7 @@ This is a occasional very long sentence compared to the other. In that case, the
|
||||
tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to the high slowdown. Even worse, on
|
||||
bigger batches, the program simply crashes.
|
||||
|
||||
|
||||
```
|
||||
```text
|
||||
------------------------------
|
||||
Streaming no batching
|
||||
100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s]
|
||||
@ -245,7 +243,6 @@ multiple forward pass of a model. Under normal circumstances, this would yield i
|
||||
In order to circumvent this issue, both of these pipelines are a bit specific: they are `ChunkPipeline` instead of
|
||||
regular `Pipeline`. In short:
|
||||
|
||||
|
||||
```python
|
||||
preprocessed = pipe.preprocess(inputs)
|
||||
model_outputs = pipe.forward(preprocessed)
|
||||
@ -254,7 +251,6 @@ outputs = pipe.postprocess(model_outputs)
|
||||
|
||||
Now becomes:
|
||||
|
||||
|
||||
```python
|
||||
all_model_outputs = []
|
||||
for preprocessed in pipe.preprocess(inputs):
|
||||
@ -282,7 +278,6 @@ If you want to override a specific pipeline.
|
||||
Don't hesitate to create an issue for your task at hand; the goal of the pipeline is to be easy to use and support most
|
||||
cases, so `transformers` could maybe support your use case.
|
||||
|
||||
|
||||
If you want to try it out simply, you can:
|
||||
|
||||
- Subclass your pipeline of choice
|
||||
@ -302,7 +297,6 @@ my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline)
|
||||
|
||||
That should enable you to do all the custom code you want.
|
||||
|
||||
|
||||
## Implementing a pipeline
|
||||
|
||||
[Implementing a new pipeline](../add_new_pipeline)
|
||||
@ -329,7 +323,6 @@ Pipelines available for audio tasks include the following.
|
||||
- __call__
|
||||
- all
|
||||
|
||||
|
||||
### ZeroShotAudioClassificationPipeline
|
||||
|
||||
[[autodoc]] ZeroShotAudioClassificationPipeline
|
||||
|
@ -17,6 +17,7 @@ rendered properly in your Markdown viewer.
|
||||
# Processors
|
||||
|
||||
Processors can mean two different things in the Transformers library:
|
||||
|
||||
- the objects that pre-process inputs for multi-modal models such as [Wav2Vec2](../model_doc/wav2vec2) (speech and text)
|
||||
or [CLIP](../model_doc/clip) (text and vision)
|
||||
- deprecated objects that were used in older versions of the library to preprocess data for GLUE or SQUAD.
|
||||
@ -71,7 +72,6 @@ Additionally, the following method can be used to load values from a data file a
|
||||
|
||||
[[autodoc]] data.processors.glue.glue_convert_examples_to_features
|
||||
|
||||
|
||||
## XNLI
|
||||
|
||||
[The Cross-Lingual NLI Corpus (XNLI)](https://www.nyu.edu/projects/bowman/xnli/) is a benchmark that evaluates the
|
||||
@ -88,7 +88,6 @@ Please note that since the gold labels are available on the test set, evaluation
|
||||
|
||||
An example using these processors is given in the [run_xnli.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_xnli.py) script.
|
||||
|
||||
|
||||
## SQuAD
|
||||
|
||||
[The Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer//) is a benchmark that
|
||||
@ -115,11 +114,9 @@ Additionally, the following method can be used to convert SQuAD examples into
|
||||
|
||||
[[autodoc]] data.processors.squad.squad_convert_examples_to_features
|
||||
|
||||
|
||||
These processors as well as the aforementioned method can be used with files containing the data as well as with the
|
||||
*tensorflow_datasets* package. Examples are given below.
|
||||
|
||||
|
||||
### Example usage
|
||||
|
||||
Here is an example using the processors as well as the conversion method using data files:
|
||||
|
@ -30,15 +30,15 @@ like token streaming.
|
||||
## GenerationConfig
|
||||
|
||||
[[autodoc]] generation.GenerationConfig
|
||||
- from_pretrained
|
||||
- from_model_config
|
||||
- save_pretrained
|
||||
- update
|
||||
- validate
|
||||
- get_generation_mode
|
||||
- from_pretrained
|
||||
- from_model_config
|
||||
- save_pretrained
|
||||
- update
|
||||
- validate
|
||||
- get_generation_mode
|
||||
|
||||
## GenerationMixin
|
||||
|
||||
[[autodoc]] GenerationMixin
|
||||
- generate
|
||||
- compute_transition_scores
|
||||
- generate
|
||||
- compute_transition_scores
|
||||
|
@ -50,7 +50,6 @@ several advanced alignment methods which can be used to map between the original
|
||||
token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
|
||||
to a given token).
|
||||
|
||||
|
||||
# Multimodal Tokenizer
|
||||
|
||||
Apart from that each tokenizer can be a "multimodal" tokenizer which means that the tokenizer will hold all relevant special tokens
|
||||
|
@ -22,7 +22,6 @@ The video processor extends the functionality of image processors by allowing Vi
|
||||
|
||||
When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video-related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't updated your VLM; the processor will try to load video-related configurations from a file named `preprocessing_config.json`.
|
||||
|
||||
|
||||
### Usage Example
|
||||
Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
|
||||
|
||||
@ -59,7 +58,6 @@ The video processor can also sample video frames using the technique best suited
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
```python
|
||||
from transformers import AutoVideoProcessor
|
||||
|
||||
@ -92,4 +90,3 @@ print(processed_video_inputs.pixel_values_videos.shape)
|
||||
## BaseVideoProcessor
|
||||
|
||||
[[autodoc]] video_processing_utils.BaseVideoProcessor
|
||||
|
||||
|
@ -25,7 +25,6 @@ The abstract from the paper is the following:
|
||||
|
||||
*We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification. Notably, our AIMV2-3B encoder achieves 89.5% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings.*
|
||||
|
||||
|
||||
This model was contributed by [Yaswanth Gali](https://huggingface.co/yaswanthgali).
|
||||
The original code can be found [here](https://github.com/apple/ml-aim).
|
||||
|
||||
|
@ -148,6 +148,7 @@ for label, score in zip(candidate_labels, probs):
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
- Refer to the [Kakao Brain’s Open Source ViT, ALIGN, and the New COYO Text-Image Dataset](https://huggingface.co/blog/vit-align) blog post for more details.
|
||||
|
||||
## AlignConfig
|
||||
|
@ -142,7 +142,6 @@ response = processor.decode(output_ids, skip_special_tokens=True)
|
||||
print(response)
|
||||
```
|
||||
|
||||
|
||||
## AriaImageProcessor
|
||||
|
||||
[[autodoc]] AriaImageProcessor
|
||||
|
@ -61,7 +61,7 @@ page for more information.
|
||||
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
|
||||
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
|
||||
|
||||
```
|
||||
```py
|
||||
from transformers import ASTForAudioClassification
|
||||
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", attn_implementation="sdpa", dtype=torch.float16)
|
||||
...
|
||||
|
@ -23,7 +23,6 @@ automatically retrieve the relevant model given the name/path to the pretrained
|
||||
Instantiating one of [`AutoConfig`], [`AutoModel`], and
|
||||
[`AutoTokenizer`] will directly create a class of the relevant architecture. For instance
|
||||
|
||||
|
||||
```python
|
||||
model = AutoModel.from_pretrained("google-bert/bert-base-cased")
|
||||
```
|
||||
|
@ -86,7 +86,6 @@ Next, [install](https://github.com/Dao-AILab/flash-attention#installation-and-fe
|
||||
pip install -U flash-attn --no-build-isolation
|
||||
```
|
||||
|
||||
|
||||
##### Usage
|
||||
|
||||
To load a model using Flash Attention 2, we can pass the `attn_implementation="flash_attention_2"` flag to [`.from_pretrained`](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained). We'll also load the model in half-precision (e.g. `torch.float16`), since it results in almost no degradation to audio quality but significantly lower memory usage and faster inference:
|
||||
@ -97,7 +96,6 @@ model = BarkModel.from_pretrained("suno/bark-small", dtype=torch.float16, attn_i
|
||||
|
||||
##### Performance comparison
|
||||
|
||||
|
||||
The following diagram shows the latency for the native attention implementation (no optimisation) against Better Transformer and Flash Attention 2. In all cases, we generate 400 semantic tokens on a 40GB A100 GPU with PyTorch 2.1. Flash Attention 2 is also consistently faster than Better Transformer, and its performance improves even more as batch sizes increase:
|
||||
|
||||
<div style="text-align: center">
|
||||
@ -108,7 +106,6 @@ To put this into perspective, on an NVIDIA A100 and when generating 400 semantic
|
||||
|
||||
At batch size 8, on an NVIDIA A100, Flash Attention 2 is also 10% faster than Better Transformer, and at batch size 16, 25%.
|
||||
|
||||
|
||||
#### Combining optimization techniques
|
||||
|
||||
You can combine optimization techniques, and use CPU offload, half-precision and Flash Attention 2 (or 🤗 Better Transformer) all at once.
|
||||
@ -165,7 +162,6 @@ Bark can generate highly realistic, **multilingual** speech as well as other aud
|
||||
|
||||
The model can also produce **nonverbal communications** like laughing, sighing and crying.
|
||||
|
||||
|
||||
```python
|
||||
>>> # Adding non-speech cues to the input text
|
||||
>>> inputs = processor("Hello uh ... [clears throat], my dog is cute [laughter]")
|
||||
@ -235,4 +231,3 @@ To save the audio, simply take the sample rate from the model config and some sc
|
||||
|
||||
[[autodoc]] BarkSemanticConfig
|
||||
- all
|
||||
|
||||
|
@ -15,7 +15,6 @@ rendered properly in your Markdown viewer.
|
||||
-->
|
||||
*This model was released on 2019-10-29 and added to Hugging Face Transformers on 2020-11-16.*
|
||||
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
@ -24,7 +23,7 @@ rendered properly in your Markdown viewer.
|
||||
</div>
|
||||
|
||||
# BART
|
||||
[BART](https://huggingface.co/papers/1910.13461) is a sequence-to-sequence model that combines the pretraining objectives from BERT and GPT. It’s pretrained by corrupting text in different ways like deleting words, shuffling sentences, or masking tokens and learning how to fix it. The encoder encodes the corrupted document and the corrupted text is fixed by the decoder. As it learns to recover the original text, BART gets really good at both understanding and generating language.
|
||||
[BART](https://huggingface.co/papers/1910.13461) is a sequence-to-sequence model that combines the pretraining objectives from BERT and GPT. It's pretrained by corrupting text in different ways like deleting words, shuffling sentences, or masking tokens and learning how to fix it. The encoder encodes the corrupted document and the corrupted text is fixed by the decoder. As it learns to recover the original text, BART gets really good at both understanding and generating language.
|
||||
|
||||
You can find all the original BART checkpoints under the [AI at Meta](https://huggingface.co/facebook?search_models=bart) organization.
|
||||
|
||||
@ -46,6 +45,7 @@ pipeline = pipeline(
|
||||
pipeline("Plants create <mask> through a process known as photosynthesis.")
|
||||
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
@ -89,7 +89,7 @@ echo -e "Plants create <mask> through a process known as photosynthesis." | tran
|
||||
|
||||
- Inputs should be padded on the right because BART uses absolute position embeddings.
|
||||
- The [facebook/bart-large-cnn](https://huggingface.co/facebook/bart-large-cnn) checkpoint doesn't include `mask_token_id` which means it can't perform mask-filling tasks.
|
||||
- BART doesn’t use `token_type_ids` for sequence classification. Use [`BartTokenizer`] or [`~PreTrainedTokenizerBase.encode`] to get the proper splitting.
|
||||
- BART doesn't use `token_type_ids` for sequence classification. Use [`BartTokenizer`] or [`~PreTrainedTokenizerBase.encode`] to get the proper splitting.
|
||||
- The forward pass of [`BartModel`] creates the `decoder_input_ids` if they're not passed. This can be different from other model APIs, but it is a useful feature for mask-filling tasks.
|
||||
- Model predictions are intended to be identical to the original implementation when `forced_bos_token_id=0`. This only works if the text passed to `fairseq.encode` begins with a space.
|
||||
- [`~GenerationMixin.generate`] should be used for conditional generation tasks like summarization.
|
||||
|
@ -31,7 +31,6 @@ You can find all of the original BARThez checkpoints under the [BARThez](https:/
|
||||
> This model was contributed by [moussakam](https://huggingface.co/moussakam).
|
||||
> Refer to the [BART](./bart) docs for more usage examples.
|
||||
|
||||
|
||||
The example below demonstrates how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.
|
||||
|
||||
<hfoptions id="usage">
|
||||
|
@ -33,12 +33,9 @@ You can find all the original checkpoints under the [VinAI](https://huggingface.
|
||||
|
||||
The example below demonstrates how to summarize text with [`Pipeline`] or the [`AutoModel`] class.
|
||||
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
@ -98,8 +95,6 @@ transformers run --task summarization --model vinai/bartpho-word --device 0
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
|
||||
## Notes
|
||||
|
||||
- BARTpho uses the large architecture of BART with an additional layer-normalization layer on top of the encoder and decoder. The BART-specific classes should be replaced with the mBART-specific classes.
|
||||
|
@ -87,7 +87,7 @@ page for more information.
|
||||
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
|
||||
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
|
||||
|
||||
```
|
||||
```py
|
||||
from transformers import BeitForImageClassification
|
||||
model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224", attn_implementation="sdpa", dtype=torch.float16)
|
||||
...
|
||||
@ -123,6 +123,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
|
||||
- See also: [Image classification task guide](../tasks/image_classification)
|
||||
|
||||
**Semantic segmentation**
|
||||
|
||||
- [Semantic segmentation task guide](../tasks/semantic_segmentation)
|
||||
|
||||
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
|
||||
|
@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2019-07-29 and added to Hugging Face Transformers on 2020-11-16.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
|
@ -81,7 +81,6 @@ API reference information.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
## BertJapaneseTokenizer
|
||||
|
||||
[[autodoc]] BertJapaneseTokenizer
|
||||
|
@ -24,8 +24,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
## BERTweet
|
||||
|
||||
[BERTweet](https://huggingface.co/papers/2005.10200) shares the same architecture as [BERT-base](./bert), but it’s pretrained like [RoBERTa](./roberta) on English Tweets. It performs really well on Tweet-related tasks like part-of-speech tagging, named entity recognition, and text classification.
|
||||
|
||||
[BERTweet](https://huggingface.co/papers/2005.10200) shares the same architecture as [BERT-base](./bert), but it's pretrained like [RoBERTa](./roberta) on English Tweets. It performs really well on Tweet-related tasks like part-of-speech tagging, named entity recognition, and text classification.
|
||||
|
||||
You can find all the original BERTweet checkpoints under the [VinAI Research](https://huggingface.co/vinai?search_models=BERTweet) organization.
|
||||
|
||||
@ -49,6 +48,7 @@ pipeline = pipeline(
|
||||
)
|
||||
pipeline("Plants create <mask> through a process known as photosynthesis.")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
@ -88,7 +88,8 @@ echo -e "Plants create <mask> through a process known as photosynthesis." | tran
|
||||
</hfoptions>
|
||||
|
||||
## Notes
|
||||
- Use the [`AutoTokenizer`] or [`BertweetTokenizer`] because it’s preloaded with a custom vocabulary adapted to tweet-specific tokens like hashtags (#), mentions (@), emojis, and common abbreviations. Make sure to also install the [emoji](https://pypi.org/project/emoji/) library.
|
||||
|
||||
- Use the [`AutoTokenizer`] or [`BertweetTokenizer`] because it's preloaded with a custom vocabulary adapted to tweet-specific tokens like hashtags (#), mentions (@), emojis, and common abbreviations. Make sure to also install the [emoji](https://pypi.org/project/emoji/) library.
|
||||
- Inputs should be padded on the right (`padding="max_length"`) because BERT uses absolute position embeddings.
|
||||
|
||||
## BertweetTokenizer
|
||||
|
@ -47,6 +47,7 @@ pipeline = pipeline(
|
||||
)
|
||||
pipeline("Plants create [MASK] through a process known as photosynthesis.")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
@ -81,10 +82,12 @@ print(f"The predicted token is: {predicted_token}")
|
||||
```bash
|
||||
!echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google/bigbird-roberta-base --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Notes
|
||||
|
||||
- Inputs should be padded on the right because BigBird uses absolute position embeddings.
|
||||
- BigBird supports `original_full` and `block_sparse` attention. If the input sequence length is less than 1024, it is recommended to use `original_full` since sparse patterns don't offer much benefit for smaller inputs.
|
||||
- The current implementation uses window size of 3 blocks and 2 global blocks, only supports the ITC-implementation, and doesn't support `num_random_blocks=0`.
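A minimal sketch of switching between the two attention modes mentioned above (the `block_size` and `num_random_blocks` values shown are the library defaults, written out explicitly):

```python
from transformers import BigBirdModel

# Sparse attention for long sequences
model = BigBirdModel.from_pretrained(
    "google/bigbird-roberta-base",
    attention_type="block_sparse",
    block_size=64,
    num_random_blocks=3,
)

# Full attention, recommended for inputs shorter than ~1024 tokens
model = BigBirdModel.from_pretrained(
    "google/bigbird-roberta-base",
    attention_type="original_full",
)
```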
|
||||
|
@ -52,6 +52,7 @@ Through photosynthesis, plants capture energy from sunlight using a green pigmen
|
||||
These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
|
||||
This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
@ -77,6 +78,7 @@ input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
|
||||
output = model.generate(**input_ids, cache_implementation="static")
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
|
||||
|
@ -135,31 +135,26 @@ print(output)
|
||||
|
||||
[[autodoc]] BioGptConfig
|
||||
|
||||
|
||||
## BioGptTokenizer
|
||||
|
||||
[[autodoc]] BioGptTokenizer
|
||||
- save_vocabulary
|
||||
|
||||
|
||||
## BioGptModel
|
||||
|
||||
[[autodoc]] BioGptModel
|
||||
- forward
|
||||
|
||||
|
||||
## BioGptForCausalLM
|
||||
|
||||
[[autodoc]] BioGptForCausalLM
|
||||
- forward
|
||||
|
||||
|
||||
## BioGptForTokenClassification
|
||||
|
||||
[[autodoc]] BioGptForTokenClassification
|
||||
- forward
|
||||
|
||||
|
||||
## BioGptForSequenceClassification
|
||||
|
||||
[[autodoc]] BioGptForSequenceClassification
|
||||
|
@ -36,6 +36,7 @@ The original code can be found [here](https://github.com/google-research/big_tra
|
||||
## Usage tips
|
||||
|
||||
- BiT models are equivalent to ResNetv2 in terms of architecture, except that: 1) all batch normalization layers are replaced by [group normalization](https://huggingface.co/papers/1803.08494),
|
||||
|
||||
2) [weight standardization](https://huggingface.co/papers/1903.10520) is used for convolutional layers. The authors show that the combination of both is useful for training with large batch sizes, and has a significant
|
||||
impact on transfer learning.
|
||||
|
||||
|
@ -35,33 +35,29 @@ Several versions of the model weights are available on Hugging Face:
|
||||
|
||||
* [**`microsoft/bitnet-b1.58-2B-4T-gguf`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-gguf): Contains the model weights in GGUF format, compatible with the `bitnet.cpp` library for CPU inference.
|
||||
|
||||
|
||||
### Model Details
|
||||
|
||||
|
||||
* **Architecture:** Transformer-based, modified with `BitLinear` layers (BitNet framework).
|
||||
* Uses Rotary Position Embeddings (RoPE).
|
||||
* Uses squared ReLU (ReLU²) activation in FFN layers.
|
||||
* Employs [`subln`](https://proceedings.mlr.press/v202/wang23u.html) normalization.
|
||||
* No bias terms in linear or normalization layers.
|
||||
* Uses Rotary Position Embeddings (RoPE).
|
||||
* Uses squared ReLU (ReLU²) activation in FFN layers.
|
||||
* Employs [`subln`](https://proceedings.mlr.press/v202/wang23u.html) normalization.
|
||||
* No bias terms in linear or normalization layers.
|
||||
* **Quantization:** Native 1.58-bit weights and 8-bit activations (W1.58A8).
|
||||
* Weights are quantized to ternary values {-1, 0, +1} using absmean quantization during the forward pass.
|
||||
* Activations are quantized to 8-bit integers using absmax quantization (per-token).
|
||||
* **Crucially, the model was *trained from scratch* with this quantization scheme, not post-training quantized.**
|
||||
* Weights are quantized to ternary values {-1, 0, +1} using absmean quantization during the forward pass.
|
||||
* Activations are quantized to 8-bit integers using absmax quantization (per-token).
|
||||
* **Crucially, the model was *trained from scratch* with this quantization scheme, not post-training quantized.**
|
||||
* **Parameters:** ~2 Billion
|
||||
* **Training Tokens:** 4 Trillion
|
||||
* **Context Length:** Maximum sequence length of **4096 tokens**.
|
||||
* *Recommendation:* For optimal performance on tasks requiring very long contexts (beyond the pre-training length or for specialized long-reasoning tasks), we recommend performing intermediate long-sequence adaptation/training before the final fine-tuning stage.
|
||||
* **Context Length:** Maximum sequence length of **4096 tokens**.
|
||||
* *Recommendation:* For optimal performance on tasks requiring very long contexts (beyond the pre-training length or for specialized long-reasoning tasks), we recommend performing intermediate long-sequence adaptation/training before the final fine-tuning stage.
|
||||
* **Training Stages:**
|
||||
1. **Pre-training:** Large-scale training on public text/code and synthetic math data using a two-stage learning rate and weight decay schedule.
|
||||
2. **Supervised Fine-tuning (SFT):** Fine-tuned on instruction-following and conversational datasets using sum loss aggregation and specific hyperparameter tuning.
|
||||
3. **Direct Preference Optimization (DPO):** Aligned with human preferences using preference pairs.
|
||||
1. **Pre-training:** Large-scale training on public text/code and synthetic math data using a two-stage learning rate and weight decay schedule.
|
||||
2. **Supervised Fine-tuning (SFT):** Fine-tuned on instruction-following and conversational datasets using sum loss aggregation and specific hyperparameter tuning.
|
||||
3. **Direct Preference Optimization (DPO):** Aligned with human preferences using preference pairs.
|
||||
* **Tokenizer:** LLaMA 3 Tokenizer (vocab size: 128,256).
|
||||
|
||||
|
||||
## Usage tips
|
||||
|
||||
|
||||
**VERY IMPORTANT NOTE ON EFFICIENCY**
|
||||
|
||||
> Please do NOT expect performance efficiency gains (in terms of speed, latency, or energy consumption) when using this model with the standard transformers library.
|
||||
@ -106,7 +102,6 @@ response = tokenizer.decode(chat_outputs[0][chat_input.shape[-1]:], skip_special
|
||||
print("\nAssistant Response:", response)
|
||||
```
|
||||
|
||||
|
||||
## BitNetConfig
|
||||
|
||||
[[autodoc]] BitNetConfig
|
||||
|
@ -55,7 +55,6 @@ found [here](https://github.com/facebookresearch/ParlAI).
|
||||
Blenderbot Small is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
|
||||
the left.
|
||||
|
||||
|
||||
## Resources
|
||||
|
||||
- [Causal language modeling task guide](../tasks/language_modeling)
|
||||
|
@ -71,7 +71,6 @@ An example:
|
||||
`facebook/blenderbot_small_90M`, have a different architecture and consequently should be used with
|
||||
[BlenderbotSmall](blenderbot-small).
|
||||
|
||||
|
||||
## Resources
|
||||
|
||||
- [Causal language modeling task guide](../tasks/language_modeling)
|
||||
|
@ -25,7 +25,6 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
[BLIP](https://huggingface.co/papers/2201.12086) (Bootstrapped Language-Image Pretraining) is a vision-language pretraining (VLP) framework designed for *both* understanding and generation tasks. Most existing pretrained models are only good at one or the other. It uses a captioner to generate captions and a filter to remove the noisy captions. This increases training data quality and more effectively uses the messy web data.
|
||||
|
||||
|
||||
You can find all the original BLIP checkpoints under the [BLIP](https://huggingface.co/collections/Salesforce/blip-models-65242f40f1491fbf6a9e9472) collection.
|
||||
|
||||
> [!TIP]
|
||||
@ -129,7 +128,7 @@ Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/exam
|
||||
## BlipTextLMHeadModel
|
||||
|
||||
[[autodoc]] BlipTextLMHeadModel
|
||||
- forward
|
||||
- forward
|
||||
|
||||
## BlipVisionModel
|
||||
|
||||
|
@ -43,17 +43,19 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
|
||||
- [`BloomForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
|
||||
|
||||
See also:
|
||||
|
||||
- [Causal language modeling task guide](../tasks/language_modeling)
|
||||
- [Text classification task guide](../tasks/sequence_classification)
|
||||
- [Token classification task guide](../tasks/token_classification)
|
||||
- [Question answering task guide](../tasks/question_answering)
|
||||
|
||||
|
||||
⚡️ Inference
|
||||
|
||||
- A blog on [Optimization story: Bloom inference](https://huggingface.co/blog/bloom-inference-optimization).
|
||||
- A blog on [Incredibly Fast BLOOM Inference with DeepSpeed and Accelerate](https://huggingface.co/blog/bloom-inference-pytorch-scripts).
|
||||
|
||||
⚙️ Training
|
||||
|
||||
- A blog on [The Technology Behind BLOOM Training](https://huggingface.co/blog/bloom-megatron-deepspeed).
|
||||
|
||||
## BloomConfig
|
||||
|
97
docs/source/en/model_doc/blt.md
Normal file
@ -0,0 +1,97 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2024-12-13 and added to Hugging Face Transformers on 2025-09-19.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
|
||||
">
|
||||
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# Byte Latent Transformer (BLT)
|
||||
|
||||
## Overview
|
||||
|
||||
The BLT model was proposed in [Byte Latent Transformer: Patches Scale Better Than Tokens](https://huggingface.co/papers/2412.09871) by Artidoro Pagnoni, Ram Pasunuru, Pedro Rodriguez, John Nguyen, Benjamin Muller, Margaret Li, Chunting Zhou, Lili Yu, Jason Weston, Luke Zettlemoyer, Gargi Ghosh, Mike Lewis, Ari Holtzman, Srinivasan Iyer.
|
||||
BLT is a byte-level LLM that achieves tokenization-level performance through entropy-based dynamic patching.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We introduce the Byte Latent Transformer (BLT), a new byte-level LLM architecture that, for the first time, matches tokenization-based LLM performance at scale with significant improvements in inference
|
||||
efficiency and robustness. BLT encodes bytes into dynamically sized patches, which serve as the primary units of computation. Patches are segmented based on the entropy of the next byte, allocating
|
||||
more compute and model capacity where increased data complexity demands it. We present the first flop controlled scaling study of byte-level models up to 8B parameters and 4T training bytes. Our results demonstrate the feasibility of scaling models trained on raw bytes without a fixed vocabulary. Both training and inference efficiency improve due to dynamically selecting long patches when data is predictable, along with qualitative improvements on reasoning and long tail generalization. Overall, for fixed inference costs, BLT shows significantly better scaling than tokenization-based models, by simultaneously growing both patch and model size.*
|
||||
|
||||
## Usage tips
|
||||
|
||||
- **Dual Model Architecture**: BLT consists of two separate trained models:
|
||||
- **Patcher (Entropy Model)**: A smaller transformer model that predicts byte-level entropy to determine patch boundaries and segment input.
|
||||
- **Main Transformer Model**: The primary model that processes the patches through a Local Encoder, Global Transformer, and Local Decoder.
|
||||
|
||||
- **Dynamic Patching**: The model uses entropy-based dynamic patching where:
|
||||
- High-entropy regions (complex data) get shorter patches with more computational attention
|
||||
- Low-entropy regions (predictable data) get longer patches for efficiency
|
||||
- This allows the model to allocate compute resources where they're most needed
|
||||
|
||||
- **Local Encoder**: Processes byte sequences with cross-attention to patch embeddings
|
||||
- **Global Transformer**: Processes patch-level representations with full attention across patches
|
||||
- **Local Decoder**: Generates output with cross-attention back to the original byte sequence
|
||||
|
||||
- **Byte-Level Tokenizer**: Unlike traditional tokenizers that use learned vocabularies, BLT's tokenizer simply converts text to UTF-8 bytes and maps each byte to a token ID. There is no need for a vocabulary.
|
||||
|
||||
The model can be loaded via:
|
||||
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"itazap/blt-1b-hf",
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
prompt = "my name is"
NUM_TOKENS_TO_GENERATE = 50  # number of new tokens to request from the model
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
||||
|
||||
prompt = "my name is"
|
||||
generated_ids = model.generate(
|
||||
**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, use_cache=False
|
||||
)
|
||||
|
||||
print(tokenizer.decode(generated_ids[0]))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
This model was contributed by [itazap](https://huggingface.co/itazap).
|
||||
The original code can be found [here](https://github.com/facebookresearch/blt).
|
||||
|
||||
## BltConfig
|
||||
|
||||
[[autodoc]] BltConfig
|
||||
|
||||
## BltModel

[[autodoc]] BltModel
|
||||
- forward
|
||||
|
||||
## BltForCausalLM
|
||||
|
||||
[[autodoc]] BltForCausalLM
|
||||
- forward
|
@ -54,6 +54,7 @@ The [`BridgeTowerProcessor`] wraps [`RobertaTokenizer`] and [`BridgeTowerImagePr
|
||||
encode the text and prepare the images respectively.
|
||||
|
||||
The following example shows how to run contrastive learning using [`BridgeTowerProcessor`] and [`BridgeTowerForContrastiveLearning`].
|
||||
|
||||
```python
|
||||
>>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning
|
||||
>>> import requests
|
||||
@ -76,6 +77,7 @@ The following example shows how to run contrastive learning using [`BridgeTowerP
|
||||
```
|
||||
|
||||
The following example shows how to run image-text retrieval using [`BridgeTowerProcessor`] and [`BridgeTowerForImageAndTextRetrieval`].
|
||||
|
||||
```python
|
||||
>>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
|
||||
>>> import requests
|
||||
@ -130,7 +132,6 @@ Tips:
|
||||
- Please refer to [Table 5](https://huggingface.co/papers/2206.08657) for BridgeTower's performance on Image Retrieval and other downstream tasks.
|
||||
- The PyTorch version of this model is only available in torch 1.10 and higher.
|
||||
|
||||
|
||||
## BridgeTowerConfig
|
||||
|
||||
[[autodoc]] BridgeTowerConfig
|
||||
@ -177,4 +178,3 @@ Tips:
|
||||
|
||||
[[autodoc]] BridgeTowerForImageAndTextRetrieval
|
||||
- forward
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff.