mirror of
https://github.com/huggingface/transformers.git
synced 2025-10-21 17:48:57 +08:00
Compare commits
118 Commits
tools-infe
...
fix_remote
Author | SHA1 | Date | |
---|---|---|---|
ff44fe848d | |||
fe909e6650 | |||
13cc6e534f | |||
9be1243a7d | |||
c2df74b98d | |||
a9c92a7a4a | |||
4258f453fd | |||
e84bcc7997 | |||
4157b35304 | |||
5bdc6c96ec | |||
1d574aef29 | |||
1bd7848d9e | |||
9d706e8342 | |||
7a9137b3bf | |||
4d991ed2ba | |||
69c1abbc55 | |||
43bddabbb4 | |||
a873db85e7 | |||
2711a393d9 | |||
2234ff1126 | |||
2be32455ee | |||
3304f1714c | |||
ac48681ad0 | |||
749a041984 | |||
6293e28d16 | |||
0f1cf429b1 | |||
dc6743db19 | |||
e9b166e8b6 | |||
7f26a004a1 | |||
aedbcd2469 | |||
a740e1a3a4 | |||
4ec7077510 | |||
47040aec29 | |||
3d141d02c0 | |||
4cafdb1098 | |||
fea2e6ce65 | |||
be0a949ad6 | |||
1d00d759f9 | |||
d4fa69d995 | |||
b317a33b6c | |||
1239cdd59c | |||
33903dad2f | |||
85e89c890b | |||
c7609790a7 | |||
325d60ba9c | |||
ed376943bb | |||
0d65be6720 | |||
ff3240b93b | |||
f1dc3b3da1 | |||
ffadafb9f9 | |||
51561d9c13 | |||
a61d5b66ee | |||
b08e4af920 | |||
e4e95f1cf3 | |||
48ccb1aabe | |||
7f4aaf2f7e | |||
2906d2509b | |||
cef1b96ea1 | |||
a255eaa3d0 | |||
895e472484 | |||
8ad381efb7 | |||
6df0c15033 | |||
d8abc32ef6 | |||
85e3acd1d6 | |||
5589cec4c7 | |||
6b64832bbb | |||
34801f1e67 | |||
afbed38879 | |||
0c3fd13884 | |||
290e8306c4 | |||
28fae8b6fa | |||
8b2221a3aa | |||
e819766fdf | |||
74c98e5fef | |||
731029b03b | |||
8c7c78a8f2 | |||
c70005aa11 | |||
0e80e1f75c | |||
2868ea4c9d | |||
f85f4c6b0e | |||
38d71024ab | |||
37f6a02f1e | |||
d88b770d9a | |||
44bea11f6f | |||
d9caa406c2 | |||
c136c19712 | |||
8b1aa74904 | |||
42e16b3761 | |||
b4ebe76fea | |||
cc187725b7 | |||
4cd8879812 | |||
739b7c0d0e | |||
1bf544a036 | |||
3d673f0602 | |||
a463897305 | |||
49766e665d | |||
360cddd78c | |||
9cef2daeae | |||
259b0af144 | |||
6c1f823c83 | |||
77225aa758 | |||
bd2a928909 | |||
d6a36c0343 | |||
c8c766b566 | |||
7fa6db98a2 | |||
0d4600e77d | |||
fd140c0724 | |||
e9e68e856d | |||
e1e47d67f1 | |||
9f3da1d6aa | |||
b8354f845b | |||
67f63d0f91 | |||
3149597939 | |||
98c15ff46f | |||
e00438ffa7 | |||
8d066a02e8 | |||
30221a8d5f | |||
4f9b256b31 |
@ -63,12 +63,6 @@ jobs:
|
||||
else
|
||||
touch test_preparation/examples_test_list.txt
|
||||
fi
|
||||
- run: |
|
||||
if [ -f filtered_test_list_cross_tests.txt ]; then
|
||||
mv filtered_test_list_cross_tests.txt test_preparation/filtered_test_list_cross_tests.txt
|
||||
else
|
||||
touch test_preparation/filtered_test_list_cross_tests.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: test_preparation/test_list.txt
|
||||
- store_artifacts:
|
||||
@ -84,8 +78,6 @@ jobs:
|
||||
- run: cp test_preparation/generated_config.yml test_preparation/generated_config.txt
|
||||
- store_artifacts:
|
||||
path: test_preparation/generated_config.txt
|
||||
- store_artifacts:
|
||||
path: test_preparation/filtered_test_list_cross_tests.txt
|
||||
- continuation/continue:
|
||||
configuration_path: test_preparation/generated_config.yml
|
||||
|
||||
|
@ -51,8 +51,6 @@ class CircleCIJob:
|
||||
resource_class: Optional[str] = "xlarge"
|
||||
tests_to_run: Optional[List[str]] = None
|
||||
working_directory: str = "~/transformers"
|
||||
# This should be only used for doctest job!
|
||||
command_timeout: Optional[int] = None
|
||||
|
||||
def __post_init__(self):
|
||||
# Deal with defaults for mutable attributes.
|
||||
@ -109,15 +107,11 @@ class CircleCIJob:
|
||||
steps.append({"store_artifacts": {"path": "~/transformers/installed.txt"}})
|
||||
|
||||
all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options}
|
||||
pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()]
|
||||
pytest_flags = [f"--{key}={value}" if value is not None else f"-{key}" for key, value in all_options.items()]
|
||||
pytest_flags.append(
|
||||
f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}"
|
||||
)
|
||||
test_command = ""
|
||||
if self.command_timeout:
|
||||
test_command = f"timeout {self.command_timeout} "
|
||||
test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags)
|
||||
|
||||
test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags)
|
||||
if self.parallelism == 1:
|
||||
if self.tests_to_run is None:
|
||||
test_command += " << pipeline.parameters.tests_to_run >>"
|
||||
@ -167,37 +161,12 @@ class CircleCIJob:
|
||||
steps.append({"store_artifacts": {"path": "~/transformers/tests.txt"}})
|
||||
steps.append({"store_artifacts": {"path": "~/transformers/splitted_tests.txt"}})
|
||||
|
||||
test_command = ""
|
||||
if self.timeout:
|
||||
test_command = f"timeout {self.timeout} "
|
||||
test_command += f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags)
|
||||
test_command = f"python -m pytest -n {self.pytest_num_workers} " + " ".join(pytest_flags)
|
||||
test_command += " $(cat splitted_tests.txt)"
|
||||
if self.marker is not None:
|
||||
test_command += f" -m {self.marker}"
|
||||
|
||||
if self.name == "pr_documentation_tests":
|
||||
# can't use ` | tee tee tests_output.txt` as usual
|
||||
test_command += " > tests_output.txt"
|
||||
# Save the return code, so we can check if it is timeout in the next step.
|
||||
test_command += '; touch "$?".txt'
|
||||
# Never fail the test step for the doctest job. We will check the results in the next step, and fail that
|
||||
# step instead if the actual test failures are found. This is to avoid the timeout being reported as test
|
||||
# failure.
|
||||
test_command = f"({test_command}) || true"
|
||||
else:
|
||||
test_command += " | tee tests_output.txt"
|
||||
test_command += " | tee tests_output.txt"
|
||||
steps.append({"run": {"name": "Run tests", "command": test_command}})
|
||||
|
||||
# return code `124` means the previous (pytest run) step is timeout
|
||||
if self.name == "pr_documentation_tests":
|
||||
checkout_doctest_command = 'if [ -s reports/tests_pr_documentation_tests/failures_short.txt ]; '
|
||||
checkout_doctest_command += 'then echo "some test failed"; '
|
||||
checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/failures_short.txt; '
|
||||
checkout_doctest_command += 'cat reports/tests_pr_documentation_tests/summary_short.txt; exit -1; '
|
||||
checkout_doctest_command += 'elif [ -s reports/tests_pr_documentation_tests/stats.txt ]; then echo "All tests pass!"; '
|
||||
checkout_doctest_command += 'elif [ -f 124.txt ]; then echo "doctest timeout!"; else echo "other fatal error)"; exit -1; fi;'
|
||||
steps.append({"run": {"name": "Check doctest results", "command": checkout_doctest_command}})
|
||||
|
||||
steps.append({"store_artifacts": {"path": "~/transformers/tests_output.txt"}})
|
||||
steps.append({"store_artifacts": {"path": "~/transformers/reports"}})
|
||||
job["steps"] = steps
|
||||
@ -432,47 +401,6 @@ repo_utils_job = CircleCIJob(
|
||||
tests_to_run="tests/repo_utils",
|
||||
)
|
||||
|
||||
|
||||
# We also include a `dummy.py` file in the files to be doc-tested to prevent edge case failure. Otherwise, the pytest
|
||||
# hangs forever during test collection while showing `collecting 0 items / 21 errors`. (To see this, we have to remove
|
||||
# the bash output redirection.)
|
||||
py_command = 'from utils.tests_fetcher import get_doctest_files; to_test = get_doctest_files() + ["dummy.py"]; to_test = " ".join(to_test); print(to_test)'
|
||||
py_command = f"$(python3 -c '{py_command}')"
|
||||
command = f'echo "{py_command}" > pr_documentation_tests_temp.txt'
|
||||
doc_test_job = CircleCIJob(
|
||||
"pr_documentation_tests",
|
||||
additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"},
|
||||
install_steps=[
|
||||
"sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time",
|
||||
"pip install --upgrade pip",
|
||||
"pip install -e .[dev]",
|
||||
"pip install git+https://github.com/huggingface/accelerate",
|
||||
"pip install --upgrade pytest pytest-sugar",
|
||||
"find -name __pycache__ -delete",
|
||||
"find . -name \*.pyc -delete",
|
||||
# Add an empty file to keep the test step running correctly even no file is selected to be tested.
|
||||
"touch dummy.py",
|
||||
{
|
||||
"name": "Get files to test",
|
||||
"command": command,
|
||||
},
|
||||
{
|
||||
"name": "Show information in `Get files to test`",
|
||||
"command":
|
||||
"cat pr_documentation_tests_temp.txt"
|
||||
},
|
||||
{
|
||||
"name": "Get the last line in `pr_documentation_tests.txt`",
|
||||
"command":
|
||||
"tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests.txt"
|
||||
},
|
||||
],
|
||||
tests_to_run="$(cat pr_documentation_tests.txt)", # noqa
|
||||
pytest_options={"-doctest-modules": None, "doctest-glob": "*.mdx", "dist": "loadfile", "rvsA": None},
|
||||
command_timeout=1200, # test cannot run longer than 1200 seconds
|
||||
pytest_num_workers=1,
|
||||
)
|
||||
|
||||
REGULAR_TESTS = [
|
||||
torch_and_tf_job,
|
||||
torch_and_flax_job,
|
||||
@ -483,7 +411,6 @@ REGULAR_TESTS = [
|
||||
hub_job,
|
||||
onnx_job,
|
||||
exotic_models_job,
|
||||
doc_test_job
|
||||
]
|
||||
EXAMPLES_TESTS = [
|
||||
examples_torch_job,
|
||||
@ -520,34 +447,6 @@ def create_circleci_config(folder=None):
|
||||
if len(test_list) > 0:
|
||||
jobs.extend(REGULAR_TESTS)
|
||||
|
||||
extended_tests_to_run = set(test_list.split())
|
||||
# Extend the test files for cross test jobs
|
||||
for job in jobs:
|
||||
if job.job_name in ["tests_torch_and_tf", "tests_torch_and_flax"]:
|
||||
for test_path in copy.copy(extended_tests_to_run):
|
||||
dir_path, fn = os.path.split(test_path)
|
||||
if fn.startswith("test_modeling_tf_"):
|
||||
fn = fn.replace("test_modeling_tf_", "test_modeling_")
|
||||
elif fn.startswith("test_modeling_flax_"):
|
||||
fn = fn.replace("test_modeling_flax_", "test_modeling_")
|
||||
else:
|
||||
if job.job_name == "test_torch_and_tf":
|
||||
fn = fn.replace("test_modeling_", "test_modeling_tf_")
|
||||
elif job.job_name == "test_torch_and_flax":
|
||||
fn = fn.replace("test_modeling_", "test_modeling_flax_")
|
||||
new_test_file = str(os.path.join(dir_path, fn))
|
||||
if os.path.isfile(new_test_file):
|
||||
if new_test_file not in extended_tests_to_run:
|
||||
extended_tests_to_run.add(new_test_file)
|
||||
extended_tests_to_run = sorted(extended_tests_to_run)
|
||||
for job in jobs:
|
||||
if job.job_name in ["tests_torch_and_tf", "tests_torch_and_flax"]:
|
||||
job.tests_to_run = extended_tests_to_run
|
||||
fn = "filtered_test_list_cross_tests.txt"
|
||||
f_path = os.path.join(folder, fn)
|
||||
with open(f_path, "w") as fp:
|
||||
fp.write(" ".join(extended_tests_to_run))
|
||||
|
||||
example_file = os.path.join(folder, "examples_test_list.txt")
|
||||
if os.path.exists(example_file) and os.path.getsize(example_file) > 0:
|
||||
jobs.extend(EXAMPLES_TESTS)
|
||||
|
14
.github/workflows/doctests.yml
vendored
14
.github/workflows/doctests.yml
vendored
@ -25,17 +25,11 @@ jobs:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: uninstall transformers (installed during docker image build)
|
||||
run: python3 -m pip uninstall -y transformers
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Install transformers in edit mode
|
||||
run: python3 -m pip install -e .
|
||||
|
||||
- name: GPU visibility
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
@ -43,10 +37,18 @@ jobs:
|
||||
- name: Show installed libraries and their versions
|
||||
run: pip freeze
|
||||
|
||||
- name: Prepare files for doctests
|
||||
run: |
|
||||
python3 utils/prepare_for_doc_test.py src docs
|
||||
|
||||
- name: Run doctests
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx"
|
||||
|
||||
- name: Clean files after doctests
|
||||
run: |
|
||||
python3 utils/prepare_for_doc_test.py src docs --remove_new_line
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
|
8
Makefile
8
Makefile
@ -47,10 +47,10 @@ repo-consistency:
|
||||
# this target runs checks on all files
|
||||
|
||||
quality:
|
||||
black --check $(check_dirs) setup.py conftest.py
|
||||
black --check $(check_dirs) setup.py
|
||||
python utils/custom_init_isort.py --check_only
|
||||
python utils/sort_auto_mappings.py --check_only
|
||||
ruff $(check_dirs) setup.py conftest.py
|
||||
ruff $(check_dirs) setup.py
|
||||
doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
|
||||
python utils/check_doc_toc.py
|
||||
|
||||
@ -65,8 +65,8 @@ extra_style_checks:
|
||||
# this target runs checks on all files and potentially modifies some of them
|
||||
|
||||
style:
|
||||
black $(check_dirs) setup.py conftest.py
|
||||
ruff $(check_dirs) setup.py conftest.py --fix
|
||||
black $(check_dirs) setup.py
|
||||
ruff $(check_dirs) setup.py --fix
|
||||
${MAKE} autogenerate_code
|
||||
${MAKE} extra_style_checks
|
||||
|
||||
|
@ -301,7 +301,7 @@ Current number of checkpoints: ** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
||||
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
||||
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
|
||||
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
|
||||
@ -341,7 +341,7 @@ Current number of checkpoints: ** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/main/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
@ -400,7 +400,6 @@ Current number of checkpoints: ** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
@ -422,9 +421,8 @@ Current number of checkpoints: ** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
|
||||
@ -432,7 +430,6 @@ Current number of checkpoints: ** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
|
||||
|
@ -289,7 +289,7 @@ Número actual de puntos de control: ** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
||||
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
||||
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
|
||||
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
|
||||
@ -329,7 +329,7 @@ Número actual de puntos de control: ** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/main/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
@ -388,7 +388,6 @@ Número actual de puntos de control: ** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
@ -410,9 +409,8 @@ Número actual de puntos de control: ** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
|
||||
@ -420,7 +418,6 @@ Número actual de puntos de control: ** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
|
||||
|
@ -261,7 +261,7 @@ conda install -c huggingface transformers
|
||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (इनरिया/फेसबुक/सोरबोन से) साथ में कागज [CamemBERT: एक टेस्टी फ्रेंच लैंग्वेज मॉडल](https:// arxiv.org/abs/1911.03894) लुई मार्टिन*, बेंजामिन मुलर*, पेड्रो जेवियर ऑर्टिज़ सुआरेज़*, योआन ड्यूपॉन्ट, लॉरेंट रोमरी, एरिक विलेमोन्टे डे ला क्लर्जरी, जैमे सेडाह और बेनोइट सगोट द्वारा।
|
||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google रिसर्च से) साथ में दिया गया पेपर [कैनाइन: प्री-ट्रेनिंग ए एफिशिएंट टोकनाइजेशन-फ्री एनकोडर फॉर लैंग्वेज रिप्रेजेंटेशन]( https://arxiv.org/abs/2103.06874) जोनाथन एच क्लार्क, डैन गैरेट, यूलिया टर्क, जॉन विएटिंग द्वारा।
|
||||
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) के साथ जारी किया गया
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI से) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. द्वाराअनुसंधान पत्र [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) के साथ जारी किया गया
|
||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI से) साथ वाला पेपर [लर्निंग ट्रांसफरेबल विजुअल मॉडल फ्रॉम नेचुरल लैंग्वेज सुपरविजन](https://arxiv.org /abs/2103.00020) एलेक रैडफोर्ड, जोंग वूक किम, क्रिस हैलासी, आदित्य रमेश, गेब्रियल गोह, संध्या अग्रवाल, गिरीश शास्त्री, अमांडा एस्केल, पामेला मिश्किन, जैक क्लार्क, ग्रेचेन क्रुएगर, इल्या सुत्स्केवर द्वारा।
|
||||
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
|
||||
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (सेल्सफोर्स से) साथ में पेपर [प्रोग्राम सिंथेसिस के लिए एक संवादात्मक प्रतिमान](https://arxiv.org/abs/2203.13474) एरिक निजकैंप, बो पैंग, हिरोआकी हयाशी, लिफू तू, हुआन वांग, यिंगबो झोउ, सिल्वियो सावरेस, कैमिंग जिओंग रिलीज।
|
||||
@ -301,7 +301,7 @@ conda install -c huggingface transformers
|
||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS से) साथ वाला पेपर [FlauBERT: Unsupervised Language Model Pre-training for फ़्रेंच](https://arxiv .org/abs/1912.05372) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, बेंजामिन लेकोउटेक्स, अलेक्जेंड्रे अल्लाउज़ेन, बेनोइट क्रैबे, लॉरेंट बेसेसियर, डिडिएर श्वाब द्वारा।
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (FLAVA: A फाउंडेशनल लैंग्वेज एंड विजन अलाइनमेंट मॉडल) (https://arxiv) साथ वाला पेपर .org/abs/2112.04482) अमनप्रीत सिंह, रोंगहांग हू, वेदानुज गोस्वामी, गुइल्यूम कुएरॉन, वोज्शिएक गालुबा, मार्कस रोहरबैक, और डौवे कीला द्वारा।
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (गूगल रिसर्च से) साथ वाला पेपर [FNet: मिक्सिंग टोकन विद फूरियर ट्रांसफॉर्म्स](https://arxiv.org /abs/2105.03824) जेम्स ली-थॉर्प, जोशुआ आइंस्ली, इल्या एकस्टीन, सैंटियागो ओंटानन द्वारा।
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research से) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. द्वाराअनुसंधान पत्र [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) के साथ जारी किया गया
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/main/model_doc/focalnet)** (Microsoft Research से) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. द्वाराअनुसंधान पत्र [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) के साथ जारी किया गया
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (सीएमयू/गूगल ब्रेन से) साथ में कागज [फ़नल-ट्रांसफॉर्मर: कुशल भाषा प्रसंस्करण के लिए अनुक्रमिक अतिरेक को छानना](https://arxiv.org/abs/2006.03236) जिहांग दाई, गुओकुन लाई, यिमिंग यांग, क्वोक वी. ले द्वारा रिहाई।
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST से) साथ वाला पेपर [वर्टिकल कटडेप्थ के साथ मोनोकुलर डेप्थ एस्टीमेशन के लिए ग्लोबल-लोकल पाथ नेटवर्क्स](https:/ /arxiv.org/abs/2201.07436) डोयोन किम, वूंगह्युन गा, प्युंगवान आह, डोंगग्यू जू, सेहवान चुन, जुनमो किम द्वारा।
|
||||
@ -360,7 +360,6 @@ conda install -c huggingface transformers
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta से) the NLLB team. द्वाराअनुसंधान पत्र [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) के साथ जारी किया गया
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (विस्कॉन्सिन विश्वविद्यालय - मैडिसन से) साथ में कागज [Nyströmformer: A Nyström- आधारित एल्गोरिथम आत्म-ध्यान का अनुमान लगाने के लिए ](https://arxiv.org/abs/2102.03902) युनयांग ज़िओंग, झानपेंग ज़ेंग, रुद्रसिस चक्रवर्ती, मिंगक्सिंग टैन, ग्लेन फंग, यिन ली, विकास सिंह द्वारा पोस्ट किया गया।
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs से) पेपर [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) जितेश जैन, जिआचेन ली, मांगटिक चिउ, अली हसनी, निकिता ओरलोव, हम्फ्री शि के द्वारा जारी किया गया है।
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया।
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
@ -382,9 +381,8 @@ conda install -c huggingface transformers
|
||||
1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (झुईई टेक्नोलॉजी से), साथ में पेपर [रोफॉर्मर: रोटरी पोजिशन एंबेडिंग के साथ एन्हांस्ड ट्रांसफॉर्मर] (https://arxiv.org/pdf/2104.09864v1.pdf) जियानलिन सु और यू लू और शेंगफेंग पैन और बो वेन और युनफेंग लियू द्वारा प्रकाशित।
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng से) Bo Peng. द्वाराअनुसंधान पत्र [this repo](https://github.com/BlinkDL/RWKV-LM) के साथ जारी किया गया
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI से) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. द्वाराअनुसंधान पत्र [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) के साथ जारी किया गया
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (Meta AI से) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. द्वाराअनुसंधान पत्र [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) के साथ जारी किया गया
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP से) साथ देने वाला पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https ://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योव आर्टज़ी द्वारा।
|
||||
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP से) साथ में पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स] (https://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योआव आर्टज़ी द्वारा पोस्ट किया गया।
|
||||
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
|
||||
@ -392,7 +390,6 @@ conda install -c huggingface transformers
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (फेसबुक से) साथ में पेपर [लार्ज-स्केल सेल्फ- एंड सेमी-सुपरवाइज्ड लर्निंग फॉर स्पीच ट्रांसलेशन](https://arxiv.org/abs/2104.06678) चांगहान वांग, ऐनी वू, जुआन पिनो, एलेक्सी बेवस्की, माइकल औली, एलेक्सिस द्वारा Conneau द्वारा पोस्ट किया गया।
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (तेल अवीव यूनिवर्सिटी से) साथ में पेपर [स्पैन सिलेक्शन को प्री-ट्रेनिंग करके कुछ-शॉट क्वेश्चन आंसरिंग](https:// arxiv.org/abs/2101.00438) ओरि राम, युवल कर्स्टन, जोनाथन बेरेंट, अमीर ग्लोबर्सन, ओमर लेवी द्वारा।
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (बर्कले से) कागज के साथ [SqueezeBERT: कुशल तंत्रिका नेटवर्क के बारे में NLP को कंप्यूटर विज़न क्या सिखा सकता है?](https: //arxiv.org/abs/2006.11316) फॉरेस्ट एन. इनडोला, अल्बर्ट ई. शॉ, रवि कृष्णा, और कर्ट डब्ल्यू. केटज़र द्वारा।
|
||||
1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (MBZUAI से) Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. द्वाराअनुसंधान पत्र [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) के साथ जारी किया गया
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (माइक्रोसॉफ्ट से) साथ में कागज [स्वाइन ट्रांसफॉर्मर: शिफ्टेड विंडोज का उपयोग कर पदानुक्रमित विजन ट्रांसफॉर्मर](https://arxiv .org/abs/2103.14030) ज़ी लियू, युटोंग लिन, यू काओ, हान हू, यिक्सुआन वेई, झेंग झांग, स्टीफन लिन, बैनिंग गुओ द्वारा।
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft से) साथ वाला पेपर [Swin Transformer V2: स्केलिंग अप कैपेसिटी एंड रेजोल्यूशन](https:// ज़ी लियू, हान हू, युटोंग लिन, ज़ुलिआंग याओ, ज़ेंडा ज़ी, यिक्सुआन वेई, जिया निंग, यू काओ, झेंग झांग, ली डोंग, फुरु वेई, बैनिंग गुओ द्वारा arxiv.org/abs/2111.09883।
|
||||
1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
|
||||
|
@ -323,7 +323,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
|
||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne から) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot から公開された研究論文: [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894)
|
||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research から) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting から公開された研究論文: [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874)
|
||||
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys から) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou から公開された研究論文: [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335)
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI から) Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. から公開された研究論文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)
|
||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI から) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever から公開された研究論文: [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
|
||||
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen から) Timo Lüddecke and Alexander Ecker から公開された研究論文: [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003)
|
||||
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce から) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong から公開された研究論文: [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474)
|
||||
@ -363,7 +363,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
|
||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS から) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab から公開された研究論文: [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372)
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (Facebook AI から) Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela から公開された研究論文: [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482)
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research から) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon から公開された研究論文: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824)
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research から) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. から公開された研究論文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926)
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/main/model_doc/focalnet)** (Microsoft Research から) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. から公開された研究論文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926)
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain から) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le から公開された研究論文: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236)
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100)
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST から) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim から公開された研究論文: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436)
|
||||
@ -422,7 +422,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta から) the NLLB team. から公開された研究論文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison から) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh から公開された研究論文: [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902)
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs から) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi から公開された研究論文: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220)
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068)
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230)
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)
|
||||
@ -444,9 +443,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
|
||||
1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook から) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli から公開された研究論文: [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038)
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf)
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864)
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng から) Bo Peng. から公開された研究論文 [this repo](https://github.com/BlinkDL/RWKV-LM)
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203)
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI から) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. から公開された研究論文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (Meta AI から) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. から公開された研究論文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
|
||||
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
|
||||
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (Microsoft Research から) Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. から公開された研究論文 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205)
|
||||
@ -454,7 +452,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook から), Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau から公開された研究論文: [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678)
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University から), Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy から公開された研究論文: [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438)
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley から) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer から公開された研究論文: [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316)
|
||||
1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (MBZUAI から) Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. から公開された研究論文 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446)
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft から) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo から公開された研究論文: [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030)
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft から) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo から公開された研究論文: [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883)
|
||||
1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg から) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte から公開された研究論文: [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345)
|
||||
|
@ -238,7 +238,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (Inria/Facebook/Sorbonne 에서) Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 의 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 논문과 함께 발표했습니다.
|
||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (Google Research 에서) Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 의 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 논문과 함께 발표했습니다.
|
||||
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (OFA-Sys 에서) An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 의 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 논문과 함께 발표했습니다.
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다.
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (LAION-AI 에서 제공)은 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.의 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687)논문과 함께 발표했습니다.
|
||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (OpenAI 에서) Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 의 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 논문과 함께 발표했습니다.
|
||||
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (University of Göttingen 에서) Timo Lüddecke and Alexander Ecker 의 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 논문과 함께 발표했습니다.
|
||||
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (Salesforce 에서) Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 의 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 논문과 함께 발표했습니다.
|
||||
@ -278,7 +278,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/main/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
@ -337,7 +337,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta 에서 제공)은 the NLLB team.의 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)논문과 함께 발표했습니다.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison 에서) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 의 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 논문과 함께 발표했습니다.
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs 에서) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 의 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 논문과 함께 발표했습니다.
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다.
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다.
|
||||
@ -359,9 +358,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (Facebook 에서) Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 의 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 논문과 함께 발표했습니다.
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology 에서) Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 의 a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 논문과 함께 발표했습니다.
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng 에서 제공)은 Bo Peng.의 [this repo](https://github.com/BlinkDL/RWKV-LM)논문과 함께 발표했습니다.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI 에서 제공)은 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.의 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)논문과 함께 발표했습니다.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (Meta AI 에서 제공)은 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.의 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)논문과 함께 발표했습니다.
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다.
|
||||
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다.
|
||||
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (Microsoft Research 에서 제공)은 Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.의 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205)논문과 함께 발표했습니다.
|
||||
@ -369,7 +367,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook 에서) Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 의 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 논문과 함께 발표했습니다.
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University 에서) Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 의 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 논문과 함께 발표했습니다.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley 에서) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 의 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 논문과 함께 발표했습니다.
|
||||
1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (MBZUAI 에서 제공)은 Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.의 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446)논문과 함께 발표했습니다.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (Microsoft 에서) Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 의 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 논문과 함께 발표했습니다.
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (Microsoft 에서) Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 의 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 논문과 함께 발표했습니다.
|
||||
1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (University of Würzburg 에서) Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 의 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 논문과 함께 발표했습니다.
|
||||
|
@ -262,7 +262,7 @@ conda install -c huggingface transformers
|
||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。
|
||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。
|
||||
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (来自 OFA-Sys) 伴随论文 [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) 由 An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou 发布。
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (来自 LAION-AI) 伴随论文 [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) 由 Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov 发布。
|
||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。
|
||||
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (来自 University of Göttingen) 伴随论文 [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) 由 Timo Lüddecke and Alexander Ecker 发布。
|
||||
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。
|
||||
@ -302,7 +302,7 @@ conda install -c huggingface transformers
|
||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (来自 Facebook AI) 伴随论文 [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 由 Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela 发布。
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (来自 Microsoft Research) 伴随论文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) 由 Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao 发布。
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/main/model_doc/focalnet)** (来自 Microsoft Research) 伴随论文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) 由 Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao 发布。
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (来自 KAIST) 伴随论文 [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 由 Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim 发布。
|
||||
@ -361,7 +361,6 @@ conda install -c huggingface transformers
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (来自 Meta) 伴随论文 [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) 由 the NLLB team 发布。
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (来自 SHI Labs) 伴随论文 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 由 Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 发布。
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布.
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
|
||||
@ -383,9 +382,8 @@ conda install -c huggingface transformers
|
||||
1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (来自 Facebook) 伴随论文 [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) 由 Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli 发布。
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (来自 Bo Peng) 伴随论文 [this repo](https://github.com/BlinkDL/RWKV-LM) 由 Bo Peng 发布。
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (来自 Meta AI) 伴随论文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 由 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick 发布。
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (来自 Meta AI) 伴随论文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 由 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick 发布。
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
|
||||
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
|
||||
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (来自 Microsoft Research) 伴随论文 [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) 由 Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei 发布。
|
||||
@ -393,7 +391,6 @@ conda install -c huggingface transformers
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
|
||||
1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (来自 MBZUAI) 伴随论文 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) 由 Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan 发布。
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。
|
||||
1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (来自 University of Würzburg) 伴随论文 [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) 由 Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte 发布。
|
||||
|
@ -274,7 +274,7 @@ conda install -c huggingface transformers
|
||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
||||
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
||||
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
|
||||
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
|
||||
@ -314,7 +314,7 @@ conda install -c huggingface transformers
|
||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/main/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
@ -373,7 +373,6 @@ conda install -c huggingface transformers
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
@ -395,9 +394,8 @@ conda install -c huggingface transformers
|
||||
1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/main/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
|
||||
@ -405,7 +403,6 @@ conda install -c huggingface transformers
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[SwiftFormer](https://huggingface.co/docs/transformers/main/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
|
||||
|
13
conftest.py
13
conftest.py
@ -20,10 +20,6 @@ import sys
|
||||
import warnings
|
||||
from os.path import abspath, dirname, join
|
||||
|
||||
import _pytest
|
||||
|
||||
from transformers.testing_utils import HfDoctestModule, HfDocTestParser
|
||||
|
||||
|
||||
# allow having multiple repository checkouts and not needing to remember to rerun
|
||||
# 'pip install -e .[dev]' when switching between checkouts and running tests.
|
||||
@ -42,10 +38,11 @@ def pytest_configure(config):
|
||||
config.addinivalue_line(
|
||||
"markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested"
|
||||
)
|
||||
config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
|
||||
config.addinivalue_line(
|
||||
"markers", "is_pipeline_test: mark test to run only when pipelines are tested"
|
||||
)
|
||||
config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
|
||||
config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
|
||||
config.addinivalue_line("markers", "tool_tests: mark the tool tests that are run on their specific schedule")
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
@ -69,7 +66,7 @@ def pytest_sessionfinish(session, exitstatus):
|
||||
|
||||
|
||||
# Doctest custom flag to ignore output.
|
||||
IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT")
|
||||
IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT')
|
||||
|
||||
OutputChecker = doctest.OutputChecker
|
||||
|
||||
@ -82,5 +79,3 @@ class CustomOutputChecker(OutputChecker):
|
||||
|
||||
|
||||
doctest.OutputChecker = CustomOutputChecker
|
||||
_pytest.doctest.DoctestModule = HfDoctestModule
|
||||
doctest.DocTestParser = HfDocTestParser
|
||||
|
@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
|
||||
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
|
||||
# to be used as arguments for docker build (so far).
|
||||
|
||||
ARG PYTORCH='2.0.1'
|
||||
ARG PYTORCH='2.0.0'
|
||||
# (not always a valid torch version)
|
||||
ARG INTEL_TORCH_EXT='1.11.0'
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
@ -32,9 +32,15 @@ RUN echo torch=$VERSION
|
||||
# TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI).
|
||||
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir -U tensorflow==2.12 protobuf==3.20.3 tensorflow_text tensorflow_probability
|
||||
RUN python3 -m pip install --no-cache-dir -U tensorflow==2.11
|
||||
RUN python3 -m pip install --no-cache-dir -U tensorflow_probability
|
||||
RUN python3 -m pip uninstall -y flax jax
|
||||
|
||||
# To include the change in this commit https://github.com/onnx/tensorflow-onnx/commit/ddca3a5eb2d912f20fe7e0568dd1a3013aee9fa3
|
||||
# Otherwise, we get tf2onnx==1.8 (caused by `flatbuffers` version), and some tests fail with `ValueError: from_keras requires input_signature`.
|
||||
# TODO: remove this line once the conflict is resolved in these libraries.
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/onnx/tensorflow-onnx.git@ddca3a5eb2d912f20fe7e0568dd1a3013aee9fa3
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
|
||||
|
@ -4,7 +4,7 @@ LABEL maintainer="Hugging Face"
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ARG PYTORCH='2.0.1'
|
||||
ARG PYTORCH='2.0.0'
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
ARG CUDA='cu117'
|
||||
|
||||
|
@ -12,7 +12,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
|
||||
|
||||
# If set to nothing, will install the latest version
|
||||
ARG PYTORCH='2.0.1'
|
||||
ARG PYTORCH='2.0.0'
|
||||
ARG TORCH_VISION=''
|
||||
ARG TORCH_AUDIO=''
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
|
@ -12,7 +12,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-tensorflow,testing]
|
||||
|
||||
# If set to nothing, will install the latest version
|
||||
ARG TENSORFLOW='2.12'
|
||||
ARG TENSORFLOW='2.11'
|
||||
|
||||
RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSION='tensorflow'; python3 -m pip install --no-cache-dir -U $VERSION
|
||||
RUN python3 -m pip uninstall -y torch flax
|
||||
|
@ -8,24 +8,45 @@
|
||||
title: Get started
|
||||
- sections:
|
||||
- local: pipeline_tutorial
|
||||
title: Run inference with pipelines
|
||||
title: Pipelines for inference
|
||||
- local: autoclass_tutorial
|
||||
title: Write portable code with AutoClass
|
||||
title: Load pretrained instances with an AutoClass
|
||||
- local: preprocessing
|
||||
title: Preprocess data
|
||||
title: Preprocess
|
||||
- local: training
|
||||
title: Fine-tune a pretrained model
|
||||
- local: run_scripts
|
||||
title: Train with a script
|
||||
- local: accelerate
|
||||
title: Set up distributed training with 🤗 Accelerate
|
||||
title: Distributed training with 🤗 Accelerate
|
||||
- local: model_sharing
|
||||
title: Share your model
|
||||
- local: transformers_agents
|
||||
title: Agents
|
||||
title: Share a model
|
||||
title: Tutorials
|
||||
- sections:
|
||||
- sections:
|
||||
- local: create_a_model
|
||||
title: Create a custom architecture
|
||||
- local: custom_models
|
||||
title: Sharing custom models
|
||||
- local: run_scripts
|
||||
title: Train with a script
|
||||
- local: sagemaker
|
||||
title: Run training on Amazon SageMaker
|
||||
- local: converting_tensorflow_models
|
||||
title: Converting from TensorFlow checkpoints
|
||||
- local: serialization
|
||||
title: Export to ONNX
|
||||
- local: torchscript
|
||||
title: Export to TorchScript
|
||||
- local: troubleshooting
|
||||
title: Troubleshoot
|
||||
title: General usage
|
||||
- sections:
|
||||
- local: fast_tokenizers
|
||||
title: Use tokenizers from 🤗 Tokenizers
|
||||
- local: multilingual
|
||||
title: Inference for multilingual models
|
||||
- local: generation_strategies
|
||||
title: Text generation strategies
|
||||
- sections:
|
||||
- local: tasks/sequence_classification
|
||||
title: Text classification
|
||||
- local: tasks/token_classification
|
||||
@ -42,71 +63,38 @@
|
||||
title: Summarization
|
||||
- local: tasks/multiple_choice
|
||||
title: Multiple choice
|
||||
title: Task guides
|
||||
isExpanded: false
|
||||
title: Natural Language Processing
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: tasks/audio_classification
|
||||
title: Audio classification
|
||||
- local: tasks/asr
|
||||
title: Automatic speech recognition
|
||||
- local: tasks/audio_classification
|
||||
title: Audio classification
|
||||
- local: tasks/asr
|
||||
title: Automatic speech recognition
|
||||
title: Audio
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: tasks/image_classification
|
||||
title: Image classification
|
||||
- local: tasks/semantic_segmentation
|
||||
title: Semantic segmentation
|
||||
- local: tasks/video_classification
|
||||
title: Video classification
|
||||
- local: tasks/object_detection
|
||||
title: Object detection
|
||||
- local: tasks/zero_shot_object_detection
|
||||
title: Zero-shot object detection
|
||||
- local: tasks/zero_shot_image_classification
|
||||
title: Zero-shot image classification
|
||||
- local: tasks/monocular_depth_estimation
|
||||
title: Depth estimation
|
||||
- local: tasks/image_classification
|
||||
title: Image classification
|
||||
- local: tasks/semantic_segmentation
|
||||
title: Semantic segmentation
|
||||
- local: tasks/video_classification
|
||||
title: Video classification
|
||||
- local: tasks/object_detection
|
||||
title: Object detection
|
||||
- local: tasks/zero_shot_object_detection
|
||||
title: Zero-shot object detection
|
||||
- local: tasks/zero_shot_image_classification
|
||||
title: Zero-shot image classification
|
||||
- local: tasks/monocular_depth_estimation
|
||||
title: Depth estimation
|
||||
title: Computer Vision
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: tasks/image_captioning
|
||||
title: Image captioning
|
||||
- local: tasks/document_question_answering
|
||||
title: Document Question Answering
|
||||
- local: tasks/text-to-speech
|
||||
title: Text to speech
|
||||
- local: tasks/image_captioning
|
||||
title: Image captioning
|
||||
- local: tasks/document_question_answering
|
||||
title: Document Question Answering
|
||||
title: Multimodal
|
||||
isExpanded: false
|
||||
title: Task Guides
|
||||
- sections:
|
||||
- local: fast_tokenizers
|
||||
title: Use fast tokenizers from 🤗 Tokenizers
|
||||
- local: multilingual
|
||||
title: Run inference with multilingual models
|
||||
- local: generation_strategies
|
||||
title: Customize text generation strategy
|
||||
- local: create_a_model
|
||||
title: Use model-specific APIs
|
||||
- local: custom_models
|
||||
title: Share a custom model
|
||||
- local: sagemaker
|
||||
title: Run training on Amazon SageMaker
|
||||
- local: serialization
|
||||
title: Export to ONNX
|
||||
- local: torchscript
|
||||
title: Export to TorchScript
|
||||
- local: benchmarks
|
||||
title: Benchmarks
|
||||
- local: notebooks
|
||||
title: Notebooks with examples
|
||||
- local: community
|
||||
title: Community resources
|
||||
- local: custom_tools
|
||||
title: Custom Tools and Prompts
|
||||
- local: troubleshooting
|
||||
title: Troubleshoot
|
||||
title: Developer guides
|
||||
- sections:
|
||||
- sections:
|
||||
- local: performance
|
||||
title: Overview
|
||||
- local: perf_train_gpu_one
|
||||
@ -141,8 +129,8 @@
|
||||
title: Hyperparameter Search using Trainer API
|
||||
- local: tf_xla
|
||||
title: XLA Integration for TensorFlow Models
|
||||
title: Performance and scalability
|
||||
- sections:
|
||||
title: Performance and scalability
|
||||
- sections:
|
||||
- local: contributing
|
||||
title: How to contribute to transformers?
|
||||
- local: add_new_model
|
||||
@ -155,8 +143,16 @@
|
||||
title: Testing
|
||||
- local: pr_checks
|
||||
title: Checks on a Pull Request
|
||||
title: Contribute
|
||||
|
||||
title: Contribute
|
||||
- local: notebooks
|
||||
title: 🤗 Transformers Notebooks
|
||||
- local: community
|
||||
title: Community resources
|
||||
- local: benchmarks
|
||||
title: Benchmarks
|
||||
- local: migration
|
||||
title: Migrating from previous packages
|
||||
title: How-to guides
|
||||
- sections:
|
||||
- local: philosophy
|
||||
title: Philosophy
|
||||
@ -183,8 +179,6 @@
|
||||
title: Conceptual guides
|
||||
- sections:
|
||||
- sections:
|
||||
- local: main_classes/agent
|
||||
title: Agents and Tools
|
||||
- local: model_doc/auto
|
||||
title: Auto Classes
|
||||
- local: main_classes/callback
|
||||
@ -371,8 +365,6 @@
|
||||
title: NLLB-MoE
|
||||
- local: model_doc/nystromformer
|
||||
title: Nyströmformer
|
||||
- local: model_doc/open-llama
|
||||
title: Open-Llama
|
||||
- local: model_doc/opt
|
||||
title: OPT
|
||||
- local: model_doc/pegasus
|
||||
@ -405,8 +397,6 @@
|
||||
title: RoCBert
|
||||
- local: model_doc/roformer
|
||||
title: RoFormer
|
||||
- local: model_doc/rwkv
|
||||
title: RWKV
|
||||
- local: model_doc/splinter
|
||||
title: Splinter
|
||||
- local: model_doc/squeezebert
|
||||
@ -502,8 +492,6 @@
|
||||
title: ResNet
|
||||
- local: model_doc/segformer
|
||||
title: SegFormer
|
||||
- local: model_doc/swiftformer
|
||||
title: SwiftFormer
|
||||
- local: model_doc/swin
|
||||
title: Swin Transformer
|
||||
- local: model_doc/swinv2
|
||||
|
@ -678,7 +678,7 @@ model.save_pretrained("/path/to/converted/checkpoint/folder")
|
||||
**7. Implement the forward pass**
|
||||
|
||||
Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make
|
||||
sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#34-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward
|
||||
sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward
|
||||
pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers
|
||||
implementation instead of the original one. It should look as follows:
|
||||
|
||||
|
162
docs/source/en/converting_tensorflow_models.mdx
Normal file
162
docs/source/en/converting_tensorflow_models.mdx
Normal file
@ -0,0 +1,162 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Converting From Tensorflow Checkpoints
|
||||
|
||||
A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models
|
||||
that can be loaded using the `from_pretrained` methods of the library.
|
||||
|
||||
<Tip>
|
||||
|
||||
Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any
|
||||
transformers >= 2.3.0 installation.
|
||||
|
||||
The documentation below reflects the **transformers-cli convert** command format.
|
||||
|
||||
</Tip>
|
||||
|
||||
## BERT
|
||||
|
||||
You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the
|
||||
[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py) script.
|
||||
|
||||
This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated
|
||||
configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from
|
||||
the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can
|
||||
be imported using `from_pretrained()` (see example in [quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_glue.py) ).
|
||||
|
||||
You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
|
||||
checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (\
|
||||
`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too.
|
||||
|
||||
To run this specific conversion script you will need to have TensorFlow and PyTorch installed (`pip install tensorflow`). The rest of the repository only requires PyTorch.
|
||||
|
||||
Here is an example of the conversion process for a pre-trained `BERT-Base Uncased` model:
|
||||
|
||||
```bash
|
||||
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
|
||||
|
||||
transformers-cli convert --model_type bert \
|
||||
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
|
||||
--config $BERT_BASE_DIR/bert_config.json \
|
||||
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
|
||||
```
|
||||
|
||||
You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models).
|
||||
|
||||
## ALBERT
|
||||
|
||||
Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
|
||||
[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/main/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py) script.
|
||||
|
||||
The CLI takes as input a TensorFlow checkpoint (three files starting with `model.ckpt-best`) and the accompanying
|
||||
configuration file (`albert_config.json`), then creates and saves a PyTorch model. To run this conversion you will
|
||||
need to have TensorFlow and PyTorch installed.
|
||||
|
||||
Here is an example of the conversion process for the pre-trained `ALBERT Base` model:
|
||||
|
||||
```bash
|
||||
export ALBERT_BASE_DIR=/path/to/albert/albert_base
|
||||
|
||||
transformers-cli convert --model_type albert \
|
||||
--tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
|
||||
--config $ALBERT_BASE_DIR/albert_config.json \
|
||||
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
|
||||
```
|
||||
|
||||
You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/albert#pre-trained-models).
|
||||
|
||||
## OpenAI GPT
|
||||
|
||||
Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint
|
||||
save as the same format than OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm)\
|
||||
)
|
||||
|
||||
```bash
|
||||
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
|
||||
|
||||
transformers-cli convert --model_type gpt \
|
||||
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
|
||||
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
|
||||
[--config OPENAI_GPT_CONFIG] \
|
||||
[--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
|
||||
```
|
||||
|
||||
## OpenAI GPT-2
|
||||
|
||||
Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see [here](https://github.com/openai/gpt-2))
|
||||
|
||||
```bash
|
||||
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
|
||||
|
||||
transformers-cli convert --model_type gpt2 \
|
||||
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
|
||||
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
|
||||
[--config OPENAI_GPT2_CONFIG] \
|
||||
[--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
|
||||
```
|
||||
|
||||
## Transformer-XL
|
||||
|
||||
Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models))
|
||||
|
||||
```bash
|
||||
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
|
||||
|
||||
transformers-cli convert --model_type transfo_xl \
|
||||
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
|
||||
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
|
||||
[--config TRANSFO_XL_CONFIG] \
|
||||
[--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
|
||||
```
|
||||
|
||||
## XLNet
|
||||
|
||||
Here is an example of the conversion process for a pre-trained XLNet model:
|
||||
|
||||
```bash
|
||||
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
|
||||
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
|
||||
|
||||
transformers-cli convert --model_type xlnet \
|
||||
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
|
||||
--config $TRANSFO_XL_CONFIG_PATH \
|
||||
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
|
||||
[--finetuning_task_name XLNET_FINETUNED_TASK] \
|
||||
```
|
||||
|
||||
## XLM
|
||||
|
||||
Here is an example of the conversion process for a pre-trained XLM model:
|
||||
|
||||
```bash
|
||||
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
|
||||
|
||||
transformers-cli convert --model_type xlm \
|
||||
--tf_checkpoint $XLM_CHECKPOINT_PATH \
|
||||
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT
|
||||
[--config XML_CONFIG] \
|
||||
[--finetuning_task_name XML_FINETUNED_TASK]
|
||||
```
|
||||
|
||||
## T5
|
||||
|
||||
Here is an example of the conversion process for a pre-trained T5 model:
|
||||
|
||||
```bash
|
||||
export T5=/path/to/t5/uncased_L-12_H-768_A-12
|
||||
|
||||
transformers-cli convert --model_type t5 \
|
||||
--tf_checkpoint $T5/t5_model.ckpt \
|
||||
--config $T5/t5_config.json \
|
||||
--pytorch_dump_output $T5/pytorch_model.bin
|
||||
```
|
@ -1,778 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Custom Tools and Prompts
|
||||
|
||||
<Tip>
|
||||
|
||||
If you are not aware of what tools and agents are in the context of transformers, we recommend you read the
|
||||
[Transformers Agents](transformers_agents) page first.
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Transformers Agent is an experimental API that is subject to change at any time. Results returned by the agents
|
||||
can vary as the APIs or underlying models are prone to change.
|
||||
|
||||
</Tip>
|
||||
|
||||
Creating and using custom tools and prompts is paramount to empowering the agent and having it perform new tasks.
|
||||
In this guide we'll take a look at:
|
||||
|
||||
- How to customize the prompt
|
||||
- How to use custom tools
|
||||
- How to create custom tools
|
||||
|
||||
## Customizing the prompt
|
||||
|
||||
As explained in [Transformers Agents](transformers_agents) agents can run in [`~Agent.run`] and [`~Agent.chat`] mode.
|
||||
Both the `run` and `chat` modes underlie the same logic. The language model powering the agent is conditioned on a long
|
||||
prompt and completes the prompt by generating the next tokens until the stop token is reached.
|
||||
The only difference between the two modes is that during the `chat` mode the prompt is extended with
|
||||
previous user inputs and model generations. This allows the agent to have access to past interactions,
|
||||
seemingly giving the agent some kind of memory.
|
||||
|
||||
### Structure of the prompt
|
||||
|
||||
Let's take a closer look at how the prompt is structured to understand how it can be best customized.
|
||||
The prompt is structured broadly into four parts.
|
||||
|
||||
- 1. Introduction: how the agent should behave, explanation of the concept of tools.
|
||||
- 2. Description of all the tools. This is defined by a `<<all_tools>>` token that is dynamically replaced at runtime with the tools defined/chosen by the user.
|
||||
- 3. A set of examples of tasks and their solution
|
||||
- 4. Current example, and request for solution.
|
||||
|
||||
To better understand each part, let's look at a shortened version of how the `run` prompt can look like:
|
||||
|
||||
````text
|
||||
I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task.
|
||||
[...]
|
||||
You can print intermediate results if it makes sense to do so.
|
||||
|
||||
Tools:
|
||||
- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question.
|
||||
- image_captioner: This is a tool that generates a description of an image. It takes an input named `image` which should be the image to the caption and returns a text that contains the description in English.
|
||||
[...]
|
||||
|
||||
Task: "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French."
|
||||
|
||||
I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
|
||||
|
||||
Answer:
|
||||
```py
|
||||
translated_question = translator(question=question, src_lang="French", tgt_lang="English")
|
||||
print(f"The translated question is {translated_question}.")
|
||||
answer = image_qa(image=image, question=translated_question)
|
||||
print(f"The answer is {answer}")
|
||||
```
|
||||
|
||||
Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner."
|
||||
|
||||
I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
|
||||
|
||||
Answer:
|
||||
```py
|
||||
answer = document_qa(document, question="What is the oldest person?")
|
||||
print(f"The answer is {answer}.")
|
||||
image = image_generator("A banner showing " + answer)
|
||||
```
|
||||
|
||||
[...]
|
||||
|
||||
Task: "Draw me a picture of rivers and lakes"
|
||||
|
||||
I will use the following
|
||||
````
|
||||
|
||||
The introduction (the text before *"Tools:"*) explains precisely how the model shall behave and what it should do.
|
||||
This part most likely does not need to be customized as the agent shall always behave the same way.
|
||||
|
||||
The second part (the bullet points below *"Tools"*) is dynamically added upon calling `run` or `chat`. There are
|
||||
exactly as many bullet points as there are tools in `agent.toolbox` and each bullet point consists of the name
|
||||
and description of the tool:
|
||||
|
||||
```text
|
||||
- <tool.name>: <tool.description>
|
||||
```
|
||||
|
||||
Let's verify this quickly by loading the document_qa tool and printing out the name and description.
|
||||
|
||||
```py
|
||||
from transformers import load_tool
|
||||
|
||||
document_qa = load_tool("document-question-answering")
|
||||
print(f"- {document_qa.name}: {document_qa.description}")
|
||||
```
|
||||
|
||||
which gives:
|
||||
```text
|
||||
- document_qa: This is a tool that answers a question about a document (pdf). It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. It returns a text that contains the answer to the question.
|
||||
```
|
||||
|
||||
We can see that the tool name is short and precise. The description includes two parts, the first explaining
|
||||
what the tool does and the second states what input arguments and return values are expected.
|
||||
|
||||
A good tool name and tool description are very important for the agent to correctly use it. Note that the only
|
||||
information the agent has about the tool is its name and description, so one should make sure that both
|
||||
are precisely written and match the style of the existing tools in the toolbox. In particular make sure the description
|
||||
mentions all the arguments expected by name in code-style, along with the expected type and a description of what they
|
||||
are.
|
||||
|
||||
<Tip>
|
||||
|
||||
Check the naming and description of the curated Transformers tools to better understand what name and
|
||||
description a tool is expected to have. You can see all tools with the [`Agent.toolbox`] property.
|
||||
|
||||
</Tip>
|
||||
|
||||
The third part includes a set of curated examples that show the agent exactly what code it should produce
|
||||
for what kind of user request. The large language models empowering the agent are extremely good at
|
||||
recognizing patterns in a prompt and repeating the pattern with new data. Therefore, it is very important
|
||||
that the examples are written in a way that maximizes the likelihood of the agent to generating correct,
|
||||
executable code in practice.
|
||||
|
||||
Let's have a look at one example:
|
||||
|
||||
````text
|
||||
Task: "Identify the oldest person in the `document` and create an image showcasing the result as a banner."
|
||||
|
||||
I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
|
||||
|
||||
Answer:
|
||||
```py
|
||||
answer = document_qa(document, question="What is the oldest person?")
|
||||
print(f"The answer is {answer}.")
|
||||
image = image_generator("A banner showing " + answer)
|
||||
```
|
||||
|
||||
````
|
||||
|
||||
The pattern the model is prompted to repeat has three parts: The task statement, the agent's explanation of
|
||||
what it intends to do, and finally the generated code. Every example that is part of the prompt has this exact
|
||||
pattern, thus making sure that the agent will reproduce exactly the same pattern when generating new tokens.
|
||||
|
||||
The prompt examples are curated by the Transformers team and rigorously evaluated on a set of
|
||||
[problem statements](https://github.com/huggingface/transformers/blob/main/src/transformers/tools/evaluate_agent.py)
|
||||
to ensure that the agent's prompt is as good as possible to solve real use cases of the agent.
|
||||
|
||||
The final part of the prompt corresponds to:
|
||||
```text
|
||||
Task: "Draw me a picture of rivers and lakes"
|
||||
|
||||
I will use the following
|
||||
```
|
||||
|
||||
is a final and unfinished example that the agent is tasked to complete. The unfinished example
|
||||
is dynamically created based on the actual user input. For the above example, the user ran:
|
||||
|
||||
```py
|
||||
agent.run("Draw me a picture of rivers and lakes")
|
||||
```
|
||||
|
||||
The user input - *a.k.a* the task: *"Draw me a picture of rivers and lakes"* is cast into the
|
||||
prompt template: "Task: <task> \n\n I will use the following". This sentence makes up the final lines of the
|
||||
prompt the agent is conditioned on, therefore strongly influencing the agent to finish the example
|
||||
exactly in the same way it was previously done in the examples.
|
||||
|
||||
Without going into too much detail, the chat template has the same prompt structure with the
|
||||
examples having a slightly different style, *e.g.*:
|
||||
|
||||
````text
|
||||
[...]
|
||||
|
||||
=====
|
||||
|
||||
Human: Answer the question in the variable `question` about the image stored in the variable `image`.
|
||||
|
||||
Assistant: I will use the tool `image_qa` to answer the question on the input image.
|
||||
|
||||
```py
|
||||
answer = image_qa(text=question, image=image)
|
||||
print(f"The answer is {answer}")
|
||||
```
|
||||
|
||||
Human: I tried this code, it worked but didn't give me a good result. The question is in French
|
||||
|
||||
Assistant: In this case, the question needs to be translated first. I will use the tool `translator` to do this.
|
||||
|
||||
```py
|
||||
translated_question = translator(question=question, src_lang="French", tgt_lang="English")
|
||||
print(f"The translated question is {translated_question}.")
|
||||
answer = image_qa(text=translated_question, image=image)
|
||||
print(f"The answer is {answer}")
|
||||
```
|
||||
|
||||
=====
|
||||
|
||||
[...]
|
||||
````
|
||||
|
||||
Contrary, to the examples of the `run` prompt, each `chat` prompt example has one or more exchanges between the
|
||||
*Human* and the *Assistant*. Every exchange is structured similarly to the example of the `run` prompt.
|
||||
The user's input is appended to behind *Human:* and the agent is prompted to first generate what needs to be done
|
||||
before generating code. An exchange can be based on previous exchanges, therefore allowing the user to refer
|
||||
to past exchanges as is done *e.g.* above by the user's input of "I tried **this** code" refers to the
|
||||
previously generated code of the agent.
|
||||
|
||||
Upon running `.chat`, the user's input or *task* is cast into an unfinished example of the form:
|
||||
```text
|
||||
Human: <user-input>\n\nAssistant:
|
||||
```
|
||||
which the agent completes. Contrary to the `run` command, the `chat` command then appends the completed example
|
||||
to the prompt, thus giving the agent more context for the next `chat` turn.
|
||||
|
||||
Great now that we know how the prompt is structured, let's see how we can customize it!
|
||||
|
||||
### Writing good user inputs
|
||||
|
||||
While large language models are getting better and better at understanding users' intentions, it helps
|
||||
enormously to be as precise as possible to help the agent pick the correct task. What does it mean to be
|
||||
as precise as possible?
|
||||
|
||||
The agent sees a list of tool names and their description in its prompt. The more tools are added the
|
||||
more difficult it becomes for the agent to choose the correct tool and it's even more difficult to choose
|
||||
the correct sequences of tools to run. Let's look at a common failure case, here we will only return
|
||||
the code to analyze it.
|
||||
|
||||
```py
|
||||
from transformers import HfAgent
|
||||
|
||||
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
|
||||
|
||||
agent.run("Show me a tree", return_code=True)
|
||||
```
|
||||
|
||||
gives:
|
||||
|
||||
```text
|
||||
==Explanation from the agent==
|
||||
I will use the following tool: `image_segmenter` to create a segmentation mask for the image.
|
||||
|
||||
|
||||
==Code generated by the agent==
|
||||
mask = image_segmenter(image, prompt="tree")
|
||||
```
|
||||
|
||||
which is probably not what we wanted. Instead, it is more likely that we want an image of a tree to be generated.
|
||||
To steer the agent more towards using a specific tool it can therefore be very helpful to use important keywords that
|
||||
are present in the tool's name and description. Let's have a look.
|
||||
```py
|
||||
agent.toolbox["image_generator"].description
|
||||
```
|
||||
|
||||
```text
|
||||
'This is a tool that creates an image according to a prompt, which is a text description. It takes an input named `prompt` which contains the image description and outputs an image.
|
||||
```
|
||||
|
||||
The name and description make use of the keywords "image", "prompt", "create" and "generate". Using these words will most likely work better here. Let's refine our prompt a bit.
|
||||
|
||||
```py
|
||||
agent.run("Create an image of a tree", return_code=True)
|
||||
```
|
||||
|
||||
gives:
|
||||
```text
|
||||
==Explanation from the agent==
|
||||
I will use the following tool `image_generator` to generate an image of a tree.
|
||||
|
||||
|
||||
==Code generated by the agent==
|
||||
image = image_generator(prompt="tree")
|
||||
```
|
||||
|
||||
Much better! That looks more like what we want. In short, when you notice that the agent struggles to
|
||||
correctly map your task to the correct tools, try looking up the most pertinent keywords of the tool's name
|
||||
and description and try refining your task request with it.
|
||||
|
||||
### Customizing the tool descriptions
|
||||
|
||||
As we've seen before the agent has access to each of the tools' names and descriptions. The base tools
|
||||
should have very precise names and descriptions, however, you might find that it could help to change the
|
||||
the description or name of a tool for your specific use case. This might become especially important
|
||||
when you've added multiple tools that are very similar or if you want to use your agent only for a certain
|
||||
domain, *e.g.* image generation and transformations.
|
||||
|
||||
A common problem is that the agent confuses image generation with image transformation/modification when
|
||||
used a lot for image generation tasks, *e.g.*
|
||||
```py
|
||||
agent.run("Make an image of a house and a car", return_code=True)
|
||||
```
|
||||
returns
|
||||
```text
|
||||
==Explanation from the agent==
|
||||
I will use the following tools `image_generator` to generate an image of a house and `image_transformer` to transform the image of a car into the image of a house.
|
||||
|
||||
==Code generated by the agent==
|
||||
house_image = image_generator(prompt="A house")
|
||||
car_image = image_generator(prompt="A car")
|
||||
house_car_image = image_transformer(image=car_image, prompt="A house")
|
||||
```
|
||||
|
||||
which is probably not exactly what we want here. It seems like the agent has a difficult time
|
||||
to understand the difference between `image_generator` and `image_transformer` and often uses the two together.
|
||||
|
||||
We can help the agent here by changing the tool name and description of `image_transformer`. Let's instead call it `modifier`
|
||||
to disassociate it a bit from "image" and "prompt":
|
||||
```py
|
||||
agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer")
|
||||
agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace(
|
||||
"transforms an image according to a prompt", "modifies an image"
|
||||
)
|
||||
```
|
||||
|
||||
Now "modify" is a strong cue to use the new image processor which should help with the above prompt. Let's run it again.
|
||||
|
||||
```py
|
||||
agent.run("Make an image of a house and a car", return_code=True)
|
||||
```
|
||||
|
||||
Now we're getting:
|
||||
```text
|
||||
==Explanation from the agent==
|
||||
I will use the following tools: `image_generator` to generate an image of a house, then `image_generator` to generate an image of a car.
|
||||
|
||||
|
||||
==Code generated by the agent==
|
||||
house_image = image_generator(prompt="A house")
|
||||
car_image = image_generator(prompt="A car")
|
||||
```
|
||||
|
||||
which is definitely closer to what we had in mind! However, we want to have both the house and car in the same image. Steering the task more toward single image generation should help:
|
||||
|
||||
```py
|
||||
agent.run("Create image: 'A house and car'", return_code=True)
|
||||
```
|
||||
|
||||
```text
|
||||
==Explanation from the agent==
|
||||
I will use the following tool: `image_generator` to generate an image.
|
||||
|
||||
|
||||
==Code generated by the agent==
|
||||
image = image_generator(prompt="A house and car")
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Agents are still brittle for many use cases, especially when it comes to
|
||||
slightly more complex use cases like generating an image of multiple objects.
|
||||
Both the agent itself and the underlying prompt will be further improved in the coming
|
||||
months making sure that agents become more robust to a variety of user inputs.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Customizing the whole prompt
|
||||
|
||||
To give the user maximum flexibility, the whole prompt template as explained in [above](#structure-of-the-prompt)
|
||||
can be overwritten by the user. In this case make sure that your custom prompt includes an introduction section,
|
||||
a tool section, an example section, and an unfinished example section. If you want to overwrite the `run` prompt template,
|
||||
you can do as follows:
|
||||
|
||||
```py
|
||||
template = """ [...] """
|
||||
|
||||
agent = HfAgent(your_endpoint, run_prompt_template=template)
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Please make sure to have the `<<all_tools>>` string and the `<<prompt>>` defined somewhere in the `template` so that the agent can be aware
|
||||
of the tools, it has available to it as well as correctly insert the user's prompt.
|
||||
|
||||
</Tip>
|
||||
|
||||
Similarly, one can overwrite the `chat` prompt template. Note that the `chat` mode always uses the following format for the exchanges:
|
||||
```text
|
||||
Human: <<task>>
|
||||
|
||||
Assistant:
|
||||
```
|
||||
|
||||
Therefore it is important that the examples of the custom `chat` prompt template also make use of this format.
|
||||
You can overwrite the `chat` template at instantiation as follows.
|
||||
|
||||
```
|
||||
template = """ [...] """
|
||||
|
||||
agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template)
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Please make sure to have the `<<all_tools>>` string defined somewhere in the `template` so that the agent can be aware
|
||||
of the tools, it has available to it.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Using custom tools
|
||||
|
||||
In this section, we'll be leveraging two existing custom tools that are specific to image generation:
|
||||
|
||||
- We replace [huggingface-tools/image-transformation](https://huggingface.co/spaces/huggingface-tools/image-transformation),
|
||||
with [diffusers/controlnet-canny-tool](https://huggingface.co/spaces/diffusers/controlnet-canny-tool)
|
||||
to allow for more image modifications.
|
||||
- We add a new tool for image upscaling to the default toolbox:
|
||||
[diffusers/latent-upscaler-tool](https://huggingface.co/spaces/diffusers/latent-upscaler-tool) replace the existing image-transformation tool.
|
||||
|
||||
We'll start by loading the custom tools with the convenient [`load_tool`] function:
|
||||
|
||||
```py
|
||||
from transformers import load_tool
|
||||
|
||||
controlnet_transformer = load_tool("diffusers/controlnet-canny-tool")
|
||||
upscaler = load_tool("diffusers/latent-upscaler-tool")
|
||||
```
|
||||
|
||||
Upon adding custom tools to an agent, the tools' descriptions and names are automatically
|
||||
included in the agents' prompts. Thus, it is imperative that custom tools have
|
||||
a well-written description and name in order for the agent to understand how to use them.
|
||||
Let's take a look at the description and name of `controlnet_transformer`:
|
||||
|
||||
```py
|
||||
print(f"Description: '{controlnet_transformer.description}'")
|
||||
print(f"Name: '{controlnet_transformer.name}'")
|
||||
```
|
||||
|
||||
gives
|
||||
```text
|
||||
Description: 'This is a tool that transforms an image with ControlNet according to a prompt.
|
||||
It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. It returns the modified image.'
|
||||
Name: 'image_transformer'
|
||||
```
|
||||
|
||||
The name and description are accurate and fit the style of the [curated set of tools](./transformers_agents#a-curated-set-of-tools).
|
||||
Next, let's instantiate an agent with `controlnet_transformer` and `upscaler`:
|
||||
|
||||
```py
|
||||
tools = [controlnet_transformer, upscaler]
|
||||
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools)
|
||||
```
|
||||
|
||||
This command should give you the following info:
|
||||
|
||||
```text
|
||||
image_transformer has been replaced by <transformers_modules.diffusers.controlnet-canny-tool.bd76182c7777eba9612fc03c0
|
||||
8718a60c0aa6312.image_transformation.ControlNetTransformationTool object at 0x7f1d3bfa3a00> as provided in `additional_tools`
|
||||
```
|
||||
|
||||
The set of curated tools already has an `image_transformer` tool which is hereby replaced with our custom tool.
|
||||
|
||||
<Tip>
|
||||
|
||||
Overwriting existing tools can be beneficial if we want to use a custom tool exactly for the same task as an existing tool
|
||||
because the agent is well-versed in using the specific task. Beware that the custom tool should follow the exact same API
|
||||
as the overwritten tool in this case, or you should adapt the prompt template to make sure all examples using that
|
||||
tool are updated.
|
||||
|
||||
</Tip>
|
||||
|
||||
The upscaler tool was given the name `image_upscaler` which is not yet present in the default toolbox and is therefore simply added to the list of tools.
|
||||
You can always have a look at the toolbox that is currently available to the agent via the `agent.toolbox` attribute:
|
||||
|
||||
```py
|
||||
print("\n".join([f"- {a}" for a in agent.toolbox.keys()]))
|
||||
```
|
||||
|
||||
```text
|
||||
- document_qa
|
||||
- image_captioner
|
||||
- image_qa
|
||||
- image_segmenter
|
||||
- transcriber
|
||||
- summarizer
|
||||
- text_classifier
|
||||
- text_qa
|
||||
- text_reader
|
||||
- translator
|
||||
- image_transformer
|
||||
- text_downloader
|
||||
- image_generator
|
||||
- video_generator
|
||||
- image_upscaler
|
||||
```
|
||||
|
||||
Note how `image_upscaler` is now part of the agents' toolbox.
|
||||
|
||||
Let's now try out the new tools! We will re-use the image we generated in [Transformers Agents Quickstart](./transformers_agents#single-execution-run).
|
||||
|
||||
```py
|
||||
from diffusers.utils import load_image
|
||||
|
||||
image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png"
|
||||
)
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
|
||||
|
||||
Let's transform the image into a beautiful winter landscape:
|
||||
|
||||
```py
|
||||
image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image)
|
||||
```
|
||||
|
||||
```text
|
||||
==Explanation from the agent==
|
||||
I will use the following tool: `image_transformer` to transform the image.
|
||||
|
||||
|
||||
==Code generated by the agent==
|
||||
image = image_transformer(image, prompt="A frozen lake and snowy forest")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_winter.png" width=200>
|
||||
|
||||
The new image processing tool is based on ControlNet which can make very strong modifications to the image.
|
||||
By default the image processing tool returns an image of size 512x512 pixels. Let's see if we can upscale it.
|
||||
|
||||
```py
|
||||
image = agent.run("Upscale the image", image)
|
||||
```
|
||||
|
||||
```text
|
||||
==Explanation from the agent==
|
||||
I will use the following tool: `image_upscaler` to upscale the image.
|
||||
|
||||
|
||||
==Code generated by the agent==
|
||||
upscaled_image = image_upscaler(image)
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_winter_upscale.png" width=400>
|
||||
|
||||
The agent automatically mapped our prompt "Upscale the image" to the just added upscaler tool purely based on the description and name of the upscaler tool
|
||||
and was able to correctly run it.
|
||||
|
||||
Next, let's have a look at how you can create a new custom tool.
|
||||
|
||||
### Adding new tools
|
||||
|
||||
In this section, we show how to create a new tool that can be added to the agent.
|
||||
|
||||
#### Creating a new tool
|
||||
|
||||
We'll first start by creating a tool. We'll add the not-so-useful yet fun task of fetching the model on the Hugging Face
|
||||
Hub with the most downloads for a given task.
|
||||
|
||||
We can do that with the following code:
|
||||
|
||||
```python
|
||||
from huggingface_hub import list_models
|
||||
|
||||
task = "text-classification"
|
||||
|
||||
model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
|
||||
print(model.id)
|
||||
```
|
||||
|
||||
For the task `text-classification`, this returns `'facebook/bart-large-mnli'`, for `translation` it returns `'t5-base`.
|
||||
|
||||
How do we convert this to a tool that the agent can leverage? All tools depend on the superclass `Tool` that holds the
|
||||
main attributes necessary. We'll create a class that inherits from it:
|
||||
|
||||
```python
|
||||
from transformers import Tool
|
||||
|
||||
|
||||
class HFModelDownloadsTool(Tool):
|
||||
pass
|
||||
```
|
||||
|
||||
This class has a few needs:
|
||||
- An attribute `name`, which corresponds to the name of the tool itself. To be in tune with other tools which have a
|
||||
performative name, we'll name it `model_download_counter`.
|
||||
- An attribute `description`, which will be used to populate the prompt of the agent.
|
||||
- `inputs` and `outputs` attributes. Defining this will help the python interpreter make educated choices about types,
|
||||
and will allow for a gradio-demo to be spawned when we push our tool to the Hub. They're both a list of expected
|
||||
values, which can be `text`, `image`, or `audio`.
|
||||
- A `__call__` method which contains the inference code. This is the code we've played with above!
|
||||
|
||||
Here's what our class looks like now:
|
||||
|
||||
```python
|
||||
from transformers import Tool
|
||||
from huggingface_hub import list_models
|
||||
|
||||
|
||||
class HFModelDownloadsTool(Tool):
|
||||
name = "model_download_counter"
|
||||
description = (
|
||||
"This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
|
||||
"It takes the name of the category (such as text-classification, depth-estimation, etc), and "
|
||||
"returns the name of the checkpoint."
|
||||
)
|
||||
|
||||
inputs = ["text"]
|
||||
outputs = ["text"]
|
||||
|
||||
def __call__(self, task: str):
|
||||
model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
|
||||
return model.id
|
||||
```
|
||||
|
||||
We now have our tool handy. Save it in a file and import it from your main script. Let's name this file
|
||||
`model_downloads.py`, so the resulting import code looks like this:
|
||||
|
||||
```python
|
||||
from model_downloads import HFModelDownloadsTool
|
||||
|
||||
tool = HFModelDownloadsTool()
|
||||
```
|
||||
|
||||
In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your
|
||||
namespace. To do so, just call `push_to_hub` on the `tool` variable:
|
||||
|
||||
```python
|
||||
tool.push_to_hub("hf-model-downloads")
|
||||
```
|
||||
|
||||
You now have your code on the Hub! Let's take a look at the final step, which is to have the agent use it.
|
||||
|
||||
#### Having the agent use the tool
|
||||
|
||||
We now have our tool that lives on the Hub which can be instantiated as such (change the user name for your tool):
|
||||
|
||||
```python
|
||||
from transformers import load_tool
|
||||
|
||||
tool = load_tool("lysandre/hf-model-downloads")
|
||||
```
|
||||
|
||||
In order to use it in the agent, simply pass it in the `additional_tools` parameter of the agent initialization method:
|
||||
|
||||
```python
|
||||
from transformers import HfAgent
|
||||
|
||||
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
|
||||
|
||||
agent.run(
|
||||
"Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
|
||||
)
|
||||
```
|
||||
which outputs the following:
|
||||
```text
|
||||
==Code generated by the agent==
|
||||
model = model_download_counter(task="text-to-video")
|
||||
print(f"The model with the most downloads is {model}.")
|
||||
audio_model = text_reader(model)
|
||||
|
||||
|
||||
==Result==
|
||||
The model with the most downloads is damo-vilab/text-to-video-ms-1.7b.
|
||||
```
|
||||
|
||||
and generates the following audio.
|
||||
|
||||
| **Audio** |
|
||||
|------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
|
||||
|
||||
|
||||
<Tip>
|
||||
|
||||
Depending on the LLM, some are quite brittle and require very exact prompts in order to work well. Having a well-defined
|
||||
name and description of the tool is paramount to having it be leveraged by the agent.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Replacing existing tools
|
||||
|
||||
Replacing existing tools can be done simply by assigning a new item to the agent's toolbox. Here's how one would do so:
|
||||
|
||||
```python
|
||||
from transformers import HfAgent, load_tool
|
||||
|
||||
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
|
||||
agent.toolbox["image-transformation"] = load_tool("diffusers/controlnet-canny-tool")
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Beware when replacing tools with others! This will also adjust the agent's prompt. This can be good if you have a better
|
||||
prompt suited for the task, but it can also result in your tool being selected way more than others or for other
|
||||
tools to be selected instead of the one you have defined.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Leveraging gradio-tools
|
||||
|
||||
[gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging
|
||||
Face Spaces as tools. It supports many existing Spaces as well as custom Spaces to be designed with it.
|
||||
|
||||
We offer support for `gradio_tools` by using the `Tool.from_gradio` method. For example, we want to take
|
||||
advantage of the `StableDiffusionPromptGeneratorTool` tool offered in the `gradio-tools` toolkit so as to
|
||||
improve our prompts and generate better images.
|
||||
|
||||
We first import the tool from `gradio_tools` and instantiate it:
|
||||
|
||||
```python
|
||||
from gradio_tools import StableDiffusionPromptGeneratorTool
|
||||
|
||||
gradio_tool = StableDiffusionPromptGeneratorTool()
|
||||
```
|
||||
|
||||
We pass that instance to the `Tool.from_gradio` method:
|
||||
|
||||
```python
|
||||
from transformers import Tool
|
||||
|
||||
tool = Tool.from_gradio(gradio_tool)
|
||||
```
|
||||
|
||||
Now we can manage it exactly as we would a usual custom tool. We leverage it to improve our prompt
|
||||
` a rabbit wearing a space suit`:
|
||||
|
||||
```python
|
||||
from transformers import HfAgent
|
||||
|
||||
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool])
|
||||
|
||||
agent.run("Generate an image of the `prompt` after improving it.", prompt="A rabbit wearing a space suit")
|
||||
```
|
||||
|
||||
The model adequately leverages the tool:
|
||||
```text
|
||||
==Explanation from the agent==
|
||||
I will use the following tools: `StableDiffusionPromptGenerator` to improve the prompt, then `image_generator` to generate an image according to the improved prompt.
|
||||
|
||||
|
||||
==Code generated by the agent==
|
||||
improved_prompt = StableDiffusionPromptGenerator(prompt)
|
||||
print(f"The improved prompt is {improved_prompt}.")
|
||||
image = image_generator(improved_prompt)
|
||||
```
|
||||
|
||||
Before finally generating the image:
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png">
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
gradio-tools requires *textual* inputs and outputs, even when working with different modalities. This implementation
|
||||
works with image and audio objects. The two are currently incompatible, but will rapidly become compatible as we
|
||||
work to improve the support.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Future compatibility with Langchain
|
||||
|
||||
We love Langchain and think it has a very compelling suite of tools. In order to handle these tools,
|
||||
Langchain requires *textual* inputs and outputs, even when working with different modalities.
|
||||
This is often the serialized version (i.e., saved to disk) of the objects.
|
||||
|
||||
This difference means that multi-modality isn't handled between transformers-agents and langchain.
|
||||
We aim for this limitation to be resolved in future versions, and welcome any help from avid langchain
|
||||
users to help us achieve this compatibility.
|
||||
|
||||
We would love to have better support. If you would like to help, please
|
||||
[open an issue](https://github.com/huggingface/transformers/issues/new) and share what you have in mind.
|
@ -75,7 +75,7 @@ The documentation is organized into five sections:
|
||||
1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
||||
1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
|
||||
1. **[CLAP](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
1. **[CLAP](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation]https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
||||
1. **[CLIPSeg](model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
|
||||
1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
|
||||
@ -174,7 +174,6 @@ The documentation is organized into five sections:
|
||||
1. **[NLLB-MOE](model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OneFormer](model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
|
||||
1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
|
||||
1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
@ -196,7 +195,6 @@ The documentation is organized into five sections:
|
||||
1. **[RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
|
||||
1. **[RoCBert](model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
|
||||
1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[RWKV](model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
|
||||
1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[Segment Anything](model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
@ -206,7 +204,6 @@ The documentation is organized into five sections:
|
||||
1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[SwiftFormer](model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
|
||||
1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
1. **[Swin2SR](model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
|
||||
@ -376,7 +373,6 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| OneFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| OpenLlama | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| OPT | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| OWL-ViT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
@ -398,7 +394,6 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| RoBERTa-PreLayerNorm | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| RoCBert | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| RWKV | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| SAM | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
@ -409,7 +404,6 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| SpeechT5 | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SwiftFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Swin Transformer V2 | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Swin2SR | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
|
@ -12,9 +12,10 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Utilities for `FeatureExtractors`
|
||||
|
||||
This page lists all the utility functions that can be used by the audio [`FeatureExtractor`] in order to compute special features from a raw audio using common algorithms such as *Short Time Fourier Transform* or *log mel spectrogram*.
|
||||
This page lists all the utility functions that can be used by the audio [`FeatureExtractor`] in order to compute special features from a raw audio using common algorithms such as *Short Time Fourier Transform* or *Mel log spectrogram*.
|
||||
|
||||
Most of those are only useful if you are studying the code of the audio processors in the library.
|
||||
|
||||
Most of those are only useful if you are studying the code of the image processors in the library.
|
||||
|
||||
## Audio Transformations
|
||||
|
||||
@ -22,14 +23,12 @@ Most of those are only useful if you are studying the code of the audio processo
|
||||
|
||||
[[autodoc]] audio_utils.mel_to_hertz
|
||||
|
||||
[[autodoc]] audio_utils.mel_filter_bank
|
||||
[[autodoc]] audio_utils.get_mel_filter_banks
|
||||
|
||||
[[autodoc]] audio_utils.optimal_fft_length
|
||||
|
||||
[[autodoc]] audio_utils.window_function
|
||||
|
||||
[[autodoc]] audio_utils.spectrogram
|
||||
[[autodoc]] audio_utils.stft
|
||||
|
||||
[[autodoc]] audio_utils.power_to_db
|
||||
|
||||
[[autodoc]] audio_utils.amplitude_to_db
|
||||
[[autodoc]] audio_utils.fram_wave
|
||||
|
||||
|
||||
|
@ -1,64 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Agents & Tools
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Transformers Agent is an experimental API which is subject to change at any time. Results returned by the agents
|
||||
can vary as the APIs or underlying models are prone to change.
|
||||
|
||||
</Tip>
|
||||
|
||||
To learn more about agents and tools make sure to read the [introductory guide](../transformers_agents). This page
|
||||
contains the API docs for the underlying classes.
|
||||
|
||||
## Agents
|
||||
|
||||
We provide two types of agents: [`HfAgent`] uses inference endpoints for opensource models and [`OpenAiAgent`] uses OpenAI closed models.
|
||||
|
||||
### HfAgent
|
||||
|
||||
[[autodoc]] HfAgent
|
||||
|
||||
### OpenAiAgent
|
||||
|
||||
[[autodoc]] OpenAiAgent
|
||||
|
||||
### Agent
|
||||
|
||||
[[autodoc]] Agent
|
||||
- chat
|
||||
- run
|
||||
- prepare_for_new_chat
|
||||
|
||||
## Tools
|
||||
|
||||
### load_tool
|
||||
|
||||
[[autodoc]] load_tool
|
||||
|
||||
### Tool
|
||||
|
||||
[[autodoc]] Tool
|
||||
|
||||
### PipelineTool
|
||||
|
||||
[[autodoc]] PipelineTool
|
||||
|
||||
### RemoteTool
|
||||
|
||||
[[autodoc]] RemoteTool
|
||||
|
||||
### launch_gradio_demo
|
||||
|
||||
[[autodoc]] launch_gradio_demo
|
@ -31,7 +31,7 @@ outputs = model(**inputs, labels=labels)
|
||||
```
|
||||
|
||||
The `outputs` object is a [`~modeling_outputs.SequenceClassifierOutput`], as we can see in the
|
||||
documentation of that class below, it means it has an optional `loss`, a `logits`, an optional `hidden_states` and
|
||||
documentation of that class below, it means it has an optional `loss`, a `logits` an optional `hidden_states` and
|
||||
an optional `attentions` attribute. Here we have the `loss` since we passed along `labels`, but we don't have
|
||||
`hidden_states` and `attentions` because we didn't pass `output_hidden_states=True` or
|
||||
`output_attentions=True`.
|
||||
|
315
docs/source/en/migration.mdx
Normal file
315
docs/source/en/migration.mdx
Normal file
@ -0,0 +1,315 @@
|
||||
<!---
|
||||
Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
# Migrating from previous packages
|
||||
|
||||
## Migrating from transformers `v3.x` to `v4.x`
|
||||
|
||||
A couple of changes were introduced when the switch from version 3 to version 4 was done. Below is a summary of the
|
||||
expected changes:
|
||||
|
||||
#### 1. AutoTokenizers and pipelines now use fast (rust) tokenizers by default.
|
||||
|
||||
The python and rust tokenizers have roughly the same API, but the rust tokenizers have a more complete feature set.
|
||||
|
||||
This introduces two breaking changes:
|
||||
- The handling of overflowing tokens between the python and rust tokenizers is different.
|
||||
- The rust tokenizers do not accept integers in the encoding methods.
|
||||
|
||||
##### How to obtain the same behavior as v3.x in v4.x
|
||||
|
||||
- The pipelines now contain additional features out of the box. See the [token-classification pipeline with the `grouped_entities` flag](main_classes/pipelines#transformers.TokenClassificationPipeline).
|
||||
- The auto-tokenizers now return rust tokenizers. In order to obtain the python tokenizers instead, the user may use the `use_fast` flag by setting it to `False`:
|
||||
|
||||
In version `v3.x`:
|
||||
```py
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
```
|
||||
to obtain the same in version `v4.x`:
|
||||
```py
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
|
||||
```
|
||||
|
||||
#### 2. SentencePiece is removed from the required dependencies
|
||||
|
||||
The requirement on the SentencePiece dependency has been lifted from the `setup.py`. This is done so that we may have a channel on anaconda cloud without relying on `conda-forge`. This means that the tokenizers that depend on the SentencePiece library will not be available with a standard `transformers` installation.
|
||||
|
||||
This includes the **slow** versions of:
|
||||
- `XLNetTokenizer`
|
||||
- `AlbertTokenizer`
|
||||
- `CamembertTokenizer`
|
||||
- `MBartTokenizer`
|
||||
- `PegasusTokenizer`
|
||||
- `T5Tokenizer`
|
||||
- `ReformerTokenizer`
|
||||
- `XLMRobertaTokenizer`
|
||||
|
||||
##### How to obtain the same behavior as v3.x in v4.x
|
||||
|
||||
In order to obtain the same behavior as version `v3.x`, you should install `sentencepiece` additionally:
|
||||
|
||||
In version `v3.x`:
|
||||
```bash
|
||||
pip install transformers
|
||||
```
|
||||
to obtain the same in version `v4.x`:
|
||||
```bash
|
||||
pip install transformers[sentencepiece]
|
||||
```
|
||||
or
|
||||
```bash
|
||||
pip install transformers sentencepiece
|
||||
```
|
||||
#### 3. The architecture of the repo has been updated so that each model resides in its folder
|
||||
|
||||
The past and foreseeable addition of new models means that the number of files in the directory `src/transformers` keeps growing and becomes harder to navigate and understand. We made the choice to put each model and the files accompanying it in their own sub-directories.
|
||||
|
||||
This is a breaking change as importing intermediary layers using a model's module directly needs to be done via a different path.
|
||||
|
||||
##### How to obtain the same behavior as v3.x in v4.x
|
||||
|
||||
In order to obtain the same behavior as version `v3.x`, you should update the path used to access the layers.
|
||||
|
||||
In version `v3.x`:
|
||||
```bash
|
||||
from transformers.modeling_bert import BertLayer
|
||||
```
|
||||
to obtain the same in version `v4.x`:
|
||||
```bash
|
||||
from transformers.models.bert.modeling_bert import BertLayer
|
||||
```
|
||||
|
||||
#### 4. Switching the `return_dict` argument to `True` by default
|
||||
|
||||
The [`return_dict` argument](main_classes/output) enables the return of dict-like python objects containing the model outputs, instead of the standard tuples. This object is self-documented as keys can be used to retrieve values, while also behaving as a tuple as users may retrieve objects by index or by slice.
|
||||
|
||||
This is a breaking change as the limitation of that tuple is that it cannot be unpacked: `value0, value1 = outputs` will not work.
|
||||
|
||||
##### How to obtain the same behavior as v3.x in v4.x
|
||||
|
||||
In order to obtain the same behavior as version `v3.x`, you should specify the `return_dict` argument to `False`, either in the model configuration or during the forward pass.
|
||||
|
||||
In version `v3.x`:
|
||||
```bash
|
||||
model = BertModel.from_pretrained("bert-base-cased")
|
||||
outputs = model(**inputs)
|
||||
```
|
||||
to obtain the same in version `v4.x`:
|
||||
```bash
|
||||
model = BertModel.from_pretrained("bert-base-cased")
|
||||
outputs = model(**inputs, return_dict=False)
|
||||
```
|
||||
or
|
||||
```bash
|
||||
model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
|
||||
outputs = model(**inputs)
|
||||
```
|
||||
|
||||
#### 5. Removed some deprecated attributes
|
||||
|
||||
Attributes that were deprecated have been removed if they had been deprecated for at least a month. The full list of deprecated attributes can be found in [#8604](https://github.com/huggingface/transformers/pull/8604).
|
||||
|
||||
Here is a list of these attributes/methods/arguments and what their replacements should be:
|
||||
|
||||
In several models, the labels become consistent with the other models:
|
||||
- `masked_lm_labels` becomes `labels` in `AlbertForMaskedLM` and `AlbertForPreTraining`.
|
||||
- `masked_lm_labels` becomes `labels` in `BertForMaskedLM` and `BertForPreTraining`.
|
||||
- `masked_lm_labels` becomes `labels` in `DistilBertForMaskedLM`.
|
||||
- `masked_lm_labels` becomes `labels` in `ElectraForMaskedLM`.
|
||||
- `masked_lm_labels` becomes `labels` in `LongformerForMaskedLM`.
|
||||
- `masked_lm_labels` becomes `labels` in `MobileBertForMaskedLM`.
|
||||
- `masked_lm_labels` becomes `labels` in `RobertaForMaskedLM`.
|
||||
- `lm_labels` becomes `labels` in `BartForConditionalGeneration`.
|
||||
- `lm_labels` becomes `labels` in `GPT2DoubleHeadsModel`.
|
||||
- `lm_labels` becomes `labels` in `OpenAIGPTDoubleHeadsModel`.
|
||||
- `lm_labels` becomes `labels` in `T5ForConditionalGeneration`.
|
||||
|
||||
In several models, the caching mechanism becomes consistent with the other models:
|
||||
- `decoder_cached_states` becomes `past_key_values` in all BART-like, FSMT and T5 models.
|
||||
- `decoder_past_key_values` becomes `past_key_values` in all BART-like, FSMT and T5 models.
|
||||
- `past` becomes `past_key_values` in all CTRL models.
|
||||
- `past` becomes `past_key_values` in all GPT-2 models.
|
||||
|
||||
Regarding the tokenizer classes:
|
||||
- The tokenizer attribute `max_len` becomes `model_max_length`.
|
||||
- The tokenizer attribute `return_lengths` becomes `return_length`.
|
||||
- The tokenizer encoding argument `is_pretokenized` becomes `is_split_into_words`.
|
||||
|
||||
Regarding the `Trainer` class:
|
||||
- The `Trainer` argument `tb_writer` is removed in favor of the callback `TensorBoardCallback(tb_writer=...)`.
|
||||
- The `Trainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
|
||||
- The `Trainer` attribute `data_collator` should be a callable.
|
||||
- The `Trainer` method `_log` is deprecated in favor of `log`.
|
||||
- The `Trainer` method `_training_step` is deprecated in favor of `training_step`.
|
||||
- The `Trainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
|
||||
- The `Trainer` method `is_local_master` is deprecated in favor of `is_local_process_zero`.
|
||||
- The `Trainer` method `is_world_master` is deprecated in favor of `is_world_process_zero`.
|
||||
|
||||
Regarding the `TFTrainer` class:
|
||||
- The `TFTrainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
|
||||
- The `Trainer` method `_log` is deprecated in favor of `log`.
|
||||
- The `TFTrainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
|
||||
- The `TFTrainer` method `_setup_wandb` is deprecated in favor of `setup_wandb`.
|
||||
- The `TFTrainer` method `_run_model` is deprecated in favor of `run_model`.
|
||||
|
||||
Regarding the `TrainingArguments` class:
|
||||
- The `TrainingArguments` argument `evaluate_during_training` is deprecated in favor of `evaluation_strategy`.
|
||||
|
||||
Regarding the Transfo-XL model:
|
||||
- The Transfo-XL configuration attribute `tie_weight` becomes `tie_words_embeddings`.
|
||||
- The Transfo-XL modeling method `reset_length` becomes `reset_memory_length`.
|
||||
|
||||
Regarding pipelines:
|
||||
- The `FillMaskPipeline` argument `topk` becomes `top_k`.
|
||||
|
||||
|
||||
|
||||
## Migrating from pytorch-transformers to 🤗 Transformers
|
||||
|
||||
Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to 🤗 Transformers.
|
||||
|
||||
### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
|
||||
|
||||
To be able to use Torchscript (see #1010, #1204 and #1195) the specific order of some models **keywords inputs** (`attention_mask`, `token_type_ids`...) has been changed.
|
||||
|
||||
If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
|
||||
|
||||
If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
|
||||
|
||||
## Migrating from pytorch-pretrained-bert
|
||||
|
||||
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to 🤗 Transformers
|
||||
|
||||
### Models always output `tuples`
|
||||
|
||||
The main breaking change when migrating from `pytorch-pretrained-bert` to 🤗 Transformers is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
|
||||
|
||||
The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
|
||||
|
||||
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
|
||||
|
||||
Here is a `pytorch-pretrained-bert` to 🤗 Transformers conversion example for a `BertForSequenceClassification` classification model:
|
||||
|
||||
```python
|
||||
# Let's load our model
|
||||
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
|
||||
|
||||
# If you used to have this line in pytorch-pretrained-bert:
|
||||
loss = model(input_ids, labels=labels)
|
||||
|
||||
# Now just use this line in 🤗 Transformers to extract the loss from the output tuple:
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss = outputs[0]
|
||||
|
||||
# In 🤗 Transformers you can also have access to the logits:
|
||||
loss, logits = outputs[:2]
|
||||
|
||||
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
|
||||
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", output_attentions=True)
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, logits, attentions = outputs
|
||||
```
|
||||
|
||||
### Serialization
|
||||
|
||||
Breaking change in the `from_pretrained()`method:
|
||||
|
||||
1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
|
||||
|
||||
2. The additional `*inputs` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute first which can break derived model classes build based on the previous `BertForSequenceClassification` examples. More precisely, the positional arguments `*inputs` provided to `from_pretrained()` are directly forwarded the model `__init__()` method while the keyword arguments `**kwargs` (i) which match configuration class attributes are used to update said attributes (ii) which don't match any configuration class attributes are forwarded to the model `__init__()` method.
|
||||
|
||||
Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
|
||||
|
||||
Here is an example:
|
||||
|
||||
```python
|
||||
### Let's load a model and tokenizer
|
||||
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
|
||||
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
### Do some stuff to our model and tokenizer
|
||||
# Ex: add new tokens to the vocabulary and embeddings of our model
|
||||
tokenizer.add_tokens(["[SPECIAL_TOKEN_1]", "[SPECIAL_TOKEN_2]"])
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
# Train our model
|
||||
train(model)
|
||||
|
||||
### Now let's save our model and tokenizer to a directory
|
||||
model.save_pretrained("./my_saved_model_directory/")
|
||||
tokenizer.save_pretrained("./my_saved_model_directory/")
|
||||
|
||||
### Reload the model and the tokenizer
|
||||
model = BertForSequenceClassification.from_pretrained("./my_saved_model_directory/")
|
||||
tokenizer = BertTokenizer.from_pretrained("./my_saved_model_directory/")
|
||||
```
|
||||
|
||||
### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
|
||||
|
||||
The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer which has a few differences:
|
||||
|
||||
- it only implements weights decay correction,
|
||||
- schedules are now externals (see below),
|
||||
- gradient clipping is now also external (see below).
|
||||
|
||||
The new optimizer `AdamW` matches PyTorch `Adam` optimizer API and let you use standard PyTorch or apex methods for the schedule and clipping.
|
||||
|
||||
The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
|
||||
|
||||
Here is a conversion examples from `BertAdam` with a linear warmup and decay schedule to `AdamW` and the same schedule:
|
||||
|
||||
```python
|
||||
# Parameters:
|
||||
lr = 1e-3
|
||||
max_grad_norm = 1.0
|
||||
num_training_steps = 1000
|
||||
num_warmup_steps = 100
|
||||
warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1
|
||||
|
||||
### Previously BertAdam optimizer was instantiated like this:
|
||||
optimizer = BertAdam(
|
||||
model.parameters(),
|
||||
lr=lr,
|
||||
schedule="warmup_linear",
|
||||
warmup=warmup_proportion,
|
||||
num_training_steps=num_training_steps,
|
||||
)
|
||||
### and used like this:
|
||||
for batch in train_data:
|
||||
loss = model(batch)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
|
||||
optimizer = AdamW(
|
||||
model.parameters(), lr=lr, correct_bias=False
|
||||
) # To reproduce BertAdam specific behavior set correct_bias=False
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
|
||||
) # PyTorch scheduler
|
||||
### and used like this:
|
||||
for batch in train_data:
|
||||
loss = model(batch)
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(
|
||||
model.parameters(), max_grad_norm
|
||||
) # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
|
||||
optimizer.step()
|
||||
scheduler.step()
|
||||
```
|
@ -54,15 +54,8 @@ This model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The
|
||||
|
||||
[[autodoc]] BioGptForCausalLM
|
||||
- forward
|
||||
|
||||
|
||||
## BioGptForTokenClassification
|
||||
|
||||
[[autodoc]] BioGptForTokenClassification
|
||||
- forward
|
||||
|
||||
|
||||
## BioGptForSequenceClassification
|
||||
|
||||
[[autodoc]] BioGptForSequenceClassification
|
||||
- forward
|
@ -19,16 +19,13 @@ specific language governing permissions and limitations under the License.
|
||||
<a href="https://huggingface.co/spaces/docs-demos/distilbert-base-uncased">
|
||||
<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
|
||||
</a>
|
||||
<a href="https://huggingface.co/papers/1910.01108">
|
||||
<img alt="Paper page" src="https://img.shields.io/badge/Paper%20page-1910.01108-green">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
## Overview
|
||||
|
||||
The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a
|
||||
distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper [DistilBERT, a
|
||||
distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/papers/1910.01108). DistilBERT is a
|
||||
distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108). DistilBERT is a
|
||||
small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than
|
||||
*bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language
|
||||
understanding benchmark.
|
||||
|
@ -24,7 +24,7 @@ specific language governing permissions and limitations under the License.
|
||||
## Overview
|
||||
|
||||
OpenAI GPT-2 model was proposed in [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) by Alec
|
||||
Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever from [OpenAI](https://huggingface.co/openai). It's a causal (unidirectional)
|
||||
Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever. It's a causal (unidirectional)
|
||||
transformer pretrained using language modeling on a very large corpus of ~40 GB of text data.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
@ -111,11 +111,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
|
||||
[[autodoc]] GPT2DoubleHeadsModel
|
||||
- forward
|
||||
|
||||
## GPT2ForQuestionAnswering
|
||||
|
||||
[[autodoc]] GPT2ForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## GPT2ForSequenceClassification
|
||||
|
||||
[[autodoc]] GPT2ForSequenceClassification
|
||||
|
@ -69,11 +69,6 @@ The `generate()` method can be used to generate text using GPT Neo model.
|
||||
[[autodoc]] GPTNeoForCausalLM
|
||||
- forward
|
||||
|
||||
## GPTNeoForQuestionAnswering
|
||||
|
||||
[[autodoc]] GPTNeoForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## GPTNeoForSequenceClassification
|
||||
|
||||
[[autodoc]] GPTNeoForSequenceClassification
|
||||
|
@ -79,11 +79,6 @@ The `generate()` method can be used to generate text using GPT Neo model.
|
||||
[[autodoc]] GPTNeoXForCausalLM
|
||||
- forward
|
||||
|
||||
## GPTNeoXForQuestionAnswering
|
||||
|
||||
[[autodoc]] GPTNeoXForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## GPTNeoXForSequenceClassification
|
||||
|
||||
[[autodoc]] GPTNeoXForSequenceClassification
|
||||
|
@ -121,28 +121,6 @@ section below.
|
||||
In addition, there's LayoutXLM, which is a multilingual version of LayoutLMv2. More information can be found on
|
||||
[LayoutXLM's documentation page](layoutxlm).
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LayoutLMv2. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
|
||||
|
||||
<PipelineTag pipeline="text-classification"/>
|
||||
|
||||
- A notebook on how to [finetune LayoutLMv2 for text-classification on RVL-CDIP dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/RVL-CDIP/Fine_tuning_LayoutLMv2ForSequenceClassification_on_RVL_CDIP.ipynb).
|
||||
- See also: [Text classification task guide](../tasks/sequence_classification)
|
||||
|
||||
<PipelineTag pipeline="question-answering"/>
|
||||
|
||||
- A notebook on how to [finetune LayoutLMv2 for question-answering on DocVQA dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/DocVQA/Fine_tuning_LayoutLMv2ForQuestionAnswering_on_DocVQA.ipynb).
|
||||
- See also: [Question answering task guide](../tasks/question_answering)
|
||||
- See also: [Document question answering task guide](../tasks/document_question_answering)
|
||||
|
||||
|
||||
<PipelineTag pipeline="token-classification"/>
|
||||
|
||||
- A notebook on how to [finetune LayoutLMv2 for token-classification on CORD dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/CORD/Fine_tuning_LayoutLMv2ForTokenClassification_on_CORD.ipynb).
|
||||
- A notebook on how to [finetune LayoutLMv2 for token-classification on FUNSD dataset](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLMv2/FUNSD/Fine_tuning_LayoutLMv2ForTokenClassification_on_FUNSD_using_HuggingFace_Trainer.ipynb).
|
||||
- See also: [Token classification task guide](../tasks/token_classification)
|
||||
|
||||
## Usage: LayoutLMv2Processor
|
||||
|
||||
The easiest way to prepare data for the model is to use [`LayoutLMv2Processor`], which internally
|
||||
@ -288,6 +266,13 @@ print(encoding.keys())
|
||||
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
|
||||
```
|
||||
|
||||
## Documentation resources
|
||||
|
||||
- [Document question answering task guide](../tasks/document_question_answering)
|
||||
- [Text classification task guide](../tasks/sequence_classification)
|
||||
- [Token classification task guide](../tasks/token_classification)
|
||||
- [Question answering task guide](../tasks/question_answering)
|
||||
|
||||
## LayoutLMv2Config
|
||||
|
||||
[[autodoc]] LayoutLMv2Config
|
||||
|
@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
## Overview
|
||||
|
||||
The LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters.
|
||||
The LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](LLaMA: Open and Efficient Foundation Language Models) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
|
@ -1,44 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Open-Llama
|
||||
|
||||
## Overview
|
||||
|
||||
The Open-Llama model was proposed in [Open-Llama project](https://github.com/s-JoL/Open-Llama) by community developer s-JoL.
|
||||
|
||||
The model is mainly based on LLaMA with some modifications, incorporating memory-efficient attention from Xformers, stable embedding from Bloom, and shared input-output embedding from PLAM.
|
||||
And the model is pre-trained on both Chinese and English, which gives it better performance on Chinese language tasks.
|
||||
|
||||
This model was contributed by [s-JoL](https://huggingface.co/s-JoL).
|
||||
The original code can be found [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
Checkpoint and usage can be found at [s-JoL/Open-Llama-V1](https://huggingface.co/s-JoL/Open-Llama-V1).
|
||||
|
||||
|
||||
## OpenLlamaConfig
|
||||
|
||||
[[autodoc]] OpenLlamaConfig
|
||||
|
||||
## OpenLlamaModel
|
||||
|
||||
[[autodoc]] OpenLlamaModel
|
||||
- forward
|
||||
|
||||
## OpenLlamaForCausalLM
|
||||
|
||||
[[autodoc]] OpenLlamaForCausalLM
|
||||
- forward
|
||||
|
||||
## OpenLlamaForSequenceClassification
|
||||
|
||||
[[autodoc]] OpenLlamaForSequenceClassification
|
||||
- forward
|
@ -19,14 +19,11 @@ specific language governing permissions and limitations under the License.
|
||||
<a href="https://huggingface.co/spaces/docs-demos/roberta-base">
|
||||
<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
|
||||
</a>
|
||||
<a href="https://huggingface.co/papers/1907.11692">
|
||||
<img alt="Paper page" src="https://img.shields.io/badge/Paper%20page-1907.11692-green">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
## Overview
|
||||
|
||||
The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, [Myle Ott](https://huggingface.co/myleott), Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
|
||||
The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
|
||||
Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018.
|
||||
|
||||
It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with
|
||||
|
@ -1,129 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# RWKV
|
||||
|
||||
## Overview
|
||||
|
||||
The RWKV model was proposed in [this repo](https://github.com/BlinkDL/RWKV-LM)
|
||||
|
||||
It suggests a tweak in the traditional Transformer attention to make it linear. This way, the model can be used as recurrent network: passing inputs for timestamp 0 and timestamp 1 together is the same as passing inputs at timestamp 0, then inputs at timestamp 1 along with the state of timestamp 0 (see example below).
|
||||
|
||||
This can be more efficient than a regular Transformer and can deal with sentence of any length (even if the model uses a fixed context length for training).
|
||||
|
||||
This model was contributed by [sgugger](https://huggingface.co/sgugger).
|
||||
The original code can be found [here](https://github.com/BlinkDL/RWKV-LM).
|
||||
|
||||
Example of use as an RNN:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, RwkvConfig, RwkvModel
|
||||
|
||||
model = RwkvModel.from_pretrained("sgugger/rwkv-430M-pile")
|
||||
tokenizer = AutoTokenizer.from_pretrained("sgugger/rwkv-430M-pile")
|
||||
|
||||
inputs = tokenizer("This is an example.", return_tensors="pt")
|
||||
# Feed everything to the model
|
||||
outputs = model(inputs["input_ids"])
|
||||
output_whole = outputs.last_hidden_state
|
||||
|
||||
outputs = model(inputs["input_ids"][:, :2])
|
||||
output_one = outputs.last_hidden_state
|
||||
|
||||
# Using the state computed on the first inputs, we will get the same output
|
||||
outputs = model(inputs["input_ids"][:, 2:], state=outputs.state)
|
||||
output_two = outputs.last_hidden_state
|
||||
|
||||
torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-5)
|
||||
```
|
||||
|
||||
## RwkvConfig
|
||||
|
||||
[[autodoc]] RwkvConfig
|
||||
|
||||
|
||||
## RwkvModel
|
||||
|
||||
[[autodoc]] RwkvModel
|
||||
- forward
|
||||
|
||||
## RwkvLMHeadModel
|
||||
|
||||
[[autodoc]] RwkvForCausalLM
|
||||
- forward
|
||||
|
||||
## Rwkv attention and the recurrent formulas
|
||||
|
||||
In a traditional auto-regressive Transformer, attention is written as
|
||||
|
||||
$$O = \hbox{softmax}(QK^{T} / \sqrt{d}) V$$
|
||||
|
||||
with \\(Q\\), \\(K\\) and \\(V\\) are matrices of shape `seq_len x hidden_size` named query, key and value (they are actually bigger matrices with a batch dimension and an attention head dimension but we're only interested in the last two, which is where the matrix product is taken, so for the sake of simplicity we only consider those two). The product \\(QK^{T}\\) then has shape `seq_len x seq_len` and we can take the maxtrix product with \\(V\\) to get the output \\(O\\) of the same shape as the others.
|
||||
|
||||
Replacing the softmax by its value gives:
|
||||
|
||||
$$O_{i} = \frac{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}} V_{j}}{\sum_{j=1}^{i} e^{Q_{i} K_{j}^{T} / \sqrt{d}}}$$
|
||||
|
||||
Note that the entries in \\(QK^{T}\\) corresponding to \\(j > i\\) are masked (the sum stops at j) because the attention is not allowed to look at future tokens (only past ones).
|
||||
|
||||
In comparison, the RWKV attention is given by
|
||||
|
||||
$$O_{i} = \sigma(R_{i}) \frac{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}} V_{j}}{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}}}$$
|
||||
|
||||
where \\(R\\) is a new matrix called receptance by the author, \\(K\\) and \\(V\\) are still the key and value (\\(\sigma\\) here is the sigmoid function). \\(W\\) is a new vector that represents the position of the token and is given by
|
||||
|
||||
$$W_{0} = u \hbox{ and } W_{k} = (k-1)w \hbox{ for } k \geq 1$$
|
||||
|
||||
with \\(u\\) and \\(w\\) learnable parameters called in the code `time_first` and `time_decay` respectively. The numerator and denominator can both be expressed recursively. Naming them \\(N_{i}\\) and \\(D_{i}\\) we have:
|
||||
|
||||
$$N_{i} = e^{u + K_{i}} V_{i} + \hat{N}_{i} \hbox{ where } \hat{N}_{i} = e^{K_{i-1}} V_{i-1} + e^{w + K_{i-2}} V_{i-2} \cdots + e^{(i-2)w + K_{1}} V_{1}$$
|
||||
|
||||
so \\(\hat{N}_{i}\\) (called `numerator_state` in the code) satistfies
|
||||
|
||||
$$\hat{N}_{0} = 0 \hbox{ and } \hat{N}_{j+1} = e^{K_{j}} V_{j} + e^{w} \hat{N}_{j}$$
|
||||
|
||||
and
|
||||
|
||||
$$D_{i} = e^{u + K_{i}} + \hat{D}_{i} \hbox{ where } \hat{D}_{i} = e^{K_{i-1}} + e^{w + K_{i-2}} \cdots + e^{(i-2)w + K_{1}}$$
|
||||
|
||||
so \\(\hat{D}_{i}\\) (called `denominator_state` in the code) satistfies
|
||||
|
||||
$$\hat{D}_{0} = 0 \hbox{ and } \hat{D}_{j+1} = e^{K_{j}} + e^{w} \hat{D}_{j}$$
|
||||
|
||||
The actual recurrent formula used are a tiny bit more complex, as for numerical stability we don't want to compute exponentials of big numbers. Usually the softmax is not computed as is, but the exponential of the maximum term is divided of the numerator and denominator:
|
||||
|
||||
$$\frac{e^{x_{i}}}{\sum_{j=1}^{n} e^{x_{j}}} = \frac{e^{x_{i} - M}}{\sum_{j=1}^{n} e^{x_{j} - M}}$$
|
||||
|
||||
with \\(M\\) the maximum of all \\(x_{j}\\). So here on top of saving the numerator state (\\(\hat{N}\\)) and the denominator state (\\(\hat{D}\\)) we also keep track of the maximum of all terms encountered in the exponentials. So we actually use
|
||||
|
||||
$$\tilde{N}_{i} = e^{-M_{i}} \hat{N}_{i} \hbox{ and } \tilde{D}_{i} = e^{-M_{i}} \hat{D}_{i}$$
|
||||
|
||||
defined by the following recurrent formulas:
|
||||
|
||||
$$\tilde{N}_{0} = 0 \hbox{ and } \tilde{N}_{j+1} = e^{K_{j} - q} V_{j} + e^{w + M_{j} - q} \tilde{N}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$
|
||||
|
||||
and
|
||||
|
||||
$$\tilde{D}_{0} = 0 \hbox{ and } \tilde{D}_{j+1} = e^{K_{j} - q} + e^{w + M_{j} - q} \tilde{D}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$
|
||||
|
||||
and \\(M_{j+1} = q\\). With those, we can then compute
|
||||
|
||||
$$N_{i} = e^{u + K_{i} - q} V_{i} + e^{M_{i}} \tilde{N}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$
|
||||
|
||||
and
|
||||
|
||||
$$D_{i} = e^{u + K_{i} - q} + e^{M_{i}} \tilde{D}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$
|
||||
|
||||
which finally gives us
|
||||
|
||||
$$O_{i} = \sigma(R_{i}) \frac{N_{i}}{D_{i}}$$
|
@ -22,7 +22,7 @@ The model can be used to predict segmentation masks of any object of interest gi
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at [https://segment-anything.com](https://segment-anything.com) to foster research into foundation models for computer vision.*
|
||||
*We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at \href{https://segment-anything.com}{https://segment-anything.com} to foster research into foundation models for computer vision.*
|
||||
|
||||
Tips:
|
||||
|
||||
@ -63,10 +63,8 @@ scores = outputs.iou_scores
|
||||
|
||||
Resources:
|
||||
|
||||
- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/segment_anything.ipynb) for using the model.
|
||||
- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/automatic_mask_generation.ipynb) for using the automatic mask generation pipeline.
|
||||
- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Run_inference_with_MedSAM_using_HuggingFace_Transformers.ipynb) for inference with MedSAM, a fine-tuned version of SAM on the medical domain.
|
||||
- [Demo notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SAM/Fine_tune_SAM_(segment_anything)_on_a_custom_dataset.ipynb) for fine-tuning the model on custom data.
|
||||
- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/segment_anything.ipynb) for using the model
|
||||
- [Demo notebook](https://github.com/huggingface/notebooks/blob/main/examples/automatic_mask_generation.ipynb) for using automatic mask generation pipeline.
|
||||
|
||||
## SamConfig
|
||||
|
||||
|
@ -1,45 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# SwiftFormer
|
||||
|
||||
## Overview
|
||||
|
||||
The SwiftFormer model was proposed in [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
|
||||
|
||||
The SwiftFormer paper introduces a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations in the self-attention computation with linear element-wise multiplications. A series of models called 'SwiftFormer' is built based on this, which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed. Even their small variant achieves 78.5% top-1 ImageNet1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2× faster compared to MobileViT-v2.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Self-attention has become a defacto choice for capturing global context in various vision applications. However, its quadratic computational complexity with respect to image resolution limits its use in real-time applications, especially for deployment on resource-constrained mobile devices. Although hybrid approaches have been proposed to combine the advantages of convolutions and self-attention for a better speed-accuracy trade-off, the expensive matrix multiplication operations in self-attention remain a bottleneck. In this work, we introduce a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations with linear element-wise multiplications. Our design shows that the key-value interaction can be replaced with a linear layer without sacrificing any accuracy. Unlike previous state-of-the-art methods, our efficient formulation of self-attention enables its usage at all stages of the network. Using our proposed efficient additive attention, we build a series of models called "SwiftFormer" which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed. Our small variant achieves 78.5% top-1 ImageNet-1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2x faster compared to MobileViT-v2.*
|
||||
|
||||
Tips:
|
||||
- One can use the [`ViTImageProcessor`] API to prepare images for the model.
|
||||
|
||||
|
||||
This model was contributed by [shehan97](https://huggingface.co/shehan97).
|
||||
The original code can be found [here](https://github.com/Amshaker/SwiftFormer).
|
||||
|
||||
|
||||
## SwiftFormerConfig
|
||||
|
||||
[[autodoc]] SwiftFormerConfig
|
||||
|
||||
## SwiftFormerModel
|
||||
|
||||
[[autodoc]] SwiftFormerModel
|
||||
- forward
|
||||
|
||||
## SwiftFormerForImageClassification
|
||||
|
||||
[[autodoc]] SwiftFormerForImageClassification
|
||||
- forward
|
@ -19,15 +19,12 @@ specific language governing permissions and limitations under the License.
|
||||
<a href="https://huggingface.co/spaces/docs-demos/t5-base">
|
||||
<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
|
||||
</a>
|
||||
<a href="https://huggingface.co/papers/1910.10683">
|
||||
<img alt="Paper page" src="https://img.shields.io/badge/Paper%20page-1910.10683-green">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
## Overview
|
||||
|
||||
The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by [Colin Raffel](https://huggingface.co/craffel), Noam Shazeer, [Adam Roberts](https://huggingface.co/adarob), Katherine Lee, Sharan Narang,
|
||||
Michael Matena, Yanqi Zhou, Wei Li, [Peter J. Liu](https://huggingface.co/peterjliu).
|
||||
The T5 model was presented in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
|
||||
Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
|
@ -50,27 +50,6 @@ Tips:
|
||||
information, see the [official models](https://huggingface.co/models?other=trocr>).
|
||||
- TrOCR is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework.
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with TrOCR. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
|
||||
|
||||
<PipelineTag pipeline="text-classification"/>
|
||||
|
||||
- A blog post on [Accelerating Document AI](https://huggingface.co/blog/document-ai) with TrOCR.
|
||||
- A blog post on how to [Document AI](https://github.com/philschmid/document-ai-transformers) with TrOCR.
|
||||
- A notebook on how to [finetune TrOCR on IAM Handwriting Database using Seq2SeqTrainer](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_Seq2SeqTrainer.ipynb).
|
||||
- A notebook on [inference with TrOCR](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Inference_with_TrOCR_%2B_Gradio_demo.ipynb) and Gradio demo.
|
||||
- A notebook on [finetune TrOCR on the IAM Handwriting Database](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb) using native PyTorch.
|
||||
- A notebook on [evaluating TrOCR on the IAM test set](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Evaluating_TrOCR_base_handwritten_on_the_IAM_test_set.ipynb).
|
||||
|
||||
<PipelineTag pipeline="text-generation"/>
|
||||
|
||||
- [Casual language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) task guide.
|
||||
|
||||
⚡️ Inference
|
||||
|
||||
- An interactive-demo on [TrOCR handwritten character recognition](https://huggingface.co/spaces/nielsr/TrOCR-handwritten).
|
||||
|
||||
## Inference
|
||||
|
||||
TrOCR's [`VisionEncoderDecoder`] model accepts images as input and makes use of
|
||||
|
@ -105,9 +105,3 @@ The original code can be found [here](https://github.com/openai/whisper).
|
||||
|
||||
[[autodoc]] FlaxWhisperForConditionalGeneration
|
||||
- __call__
|
||||
|
||||
## FlaxWhisperForAudioClassification
|
||||
|
||||
[[autodoc]] FlaxWhisperForAudioClassification
|
||||
- __call__
|
||||
|
||||
|
@ -73,7 +73,7 @@ The following "Usage in Trainer" takes mpirun in Intel® MPI library as an examp
|
||||
|
||||
|
||||
## Usage in Trainer
|
||||
To enable multi CPU distributed training in the Trainer with the ccl backend, users should add **`--ddp_backend ccl`** in the command arguments.
|
||||
To enable multi CPU distributed training in the Trainer with the ccl backend, users should add **`--xpu_backend ccl`** in the command arguments.
|
||||
|
||||
Let's see an example with the [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)
|
||||
|
||||
@ -95,7 +95,7 @@ The following command enables training with 2 processes on one Xeon node, with o
|
||||
--doc_stride 128 \
|
||||
--output_dir /tmp/debug_squad/ \
|
||||
--no_cuda \
|
||||
--ddp_backend ccl \
|
||||
--xpu_backend ccl \
|
||||
--use_ipex
|
||||
```
|
||||
The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance.
|
||||
@ -124,7 +124,7 @@ Now, run the following command in node0 and **4DDP** will be enabled in node0 an
|
||||
--doc_stride 128 \
|
||||
--output_dir /tmp/debug_squad/ \
|
||||
--no_cuda \
|
||||
--ddp_backend ccl \
|
||||
--xpu_backend ccl \
|
||||
--use_ipex \
|
||||
--bf16
|
||||
```
|
||||
|
@ -272,7 +272,7 @@ It's easy to see from the bottom diagram how PP has less dead zones, where GPUs
|
||||
|
||||
Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0.
|
||||
|
||||
PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottom diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0).
|
||||
PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottomw diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0).
|
||||
|
||||
Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS.
|
||||
|
||||
|
@ -41,7 +41,7 @@ The main tool for preprocessing textual data is a [tokenizer](main_classes/token
|
||||
|
||||
<Tip>
|
||||
|
||||
If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referred to as the *vocab*) during pretraining.
|
||||
If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referrred to as the *vocab*) during pretraining.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
@ -112,7 +112,6 @@ Ready-made configurations include the following architectures:
|
||||
- RoFormer
|
||||
- SegFormer
|
||||
- SqueezeBERT
|
||||
- SwiftFormer
|
||||
- Swin Transformer
|
||||
- T5
|
||||
- Table Transformer
|
||||
|
@ -282,7 +282,7 @@ At this point, only three steps remain:
|
||||
... args=training_args,
|
||||
... train_dataset=encoded_minds["train"],
|
||||
... eval_dataset=encoded_minds["test"],
|
||||
... tokenizer=processor,
|
||||
... tokenizer=processor.feature_extractor,
|
||||
... data_collator=data_collator,
|
||||
... compute_metrics=compute_metrics,
|
||||
... )
|
||||
|
@ -30,7 +30,7 @@ The task illustrated in this tutorial is supported by the following model archit
|
||||
|
||||
<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
|
||||
|
||||
[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn)
|
||||
[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn)
|
||||
<!--End of the generated tip-->
|
||||
|
||||
</Tip>
|
||||
|
@ -33,8 +33,8 @@ You can finetune other architectures for causal language modeling following the
|
||||
Choose one of the following architectures:
|
||||
|
||||
<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
|
||||
[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
|
||||
|
||||
[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MVP](../model_doc/mvp), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
|
||||
|
||||
<!--End of the generated tip-->
|
||||
|
||||
|
@ -31,7 +31,7 @@ The task illustrated in this tutorial is supported by the following model archit
|
||||
|
||||
<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
|
||||
|
||||
[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
|
||||
[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
|
||||
|
||||
|
||||
<!--End of the generated tip-->
|
||||
|
@ -28,7 +28,7 @@ The task illustrated in this tutorial is supported by the following model archit
|
||||
|
||||
<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
|
||||
|
||||
[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
|
||||
[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
|
||||
|
||||
|
||||
<!--End of the generated tip-->
|
||||
|
@ -1,558 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Text to speech
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Text-to-speech (TTS) is the task of creating natural-sounding speech from text, where the speech can be generated in multiple
|
||||
languages and for multiple speakers. The only text-to-speech model currently available in 🤗 Transformers
|
||||
is [SpeechT5](model_doc/speecht5), though more will be added in the future. SpeechT5 is pre-trained on a combination of
|
||||
speech-to-text and text-to-speech data, allowing it to learn a unified space of hidden representations shared by both text
|
||||
and speech. This means that the same pre-trained model can be fine-tuned for different tasks. Furthermore, SpeechT5
|
||||
supports multiple speakers through x-vector speaker embeddings.
|
||||
|
||||
This guide illustrates how to:
|
||||
|
||||
1. Fine-tune [SpeechT5](model_doc/speecht5) that was originally trained on English speech on the Dutch (`nl`) language subset of the [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) dataset.
|
||||
2. Use your fine-tuned model for inference.
|
||||
|
||||
Before you begin, make sure you have all the necessary libraries installed:
|
||||
|
||||
```bash
|
||||
pip install datasets soundfile speechbrain accelerate
|
||||
```
|
||||
|
||||
Install 🤗Transformers from source as not all the SpeechT5 features have been merged into an official release yet:
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/transformers.git
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
To follow this guide you will need a GPU. If you're working in a notebook, run the following line to check if a GPU is available:
|
||||
|
||||
```bash
|
||||
!nvidia-smi
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
We encourage you to log in to your Hugging Face account to upload and share your model with the community. When prompted, enter your token to log in:
|
||||
|
||||
```py
|
||||
>>> from huggingface_hub import notebook_login
|
||||
|
||||
>>> notebook_login()
|
||||
```
|
||||
|
||||
## Load the dataset
|
||||
|
||||
[VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) is a large-scale multilingual speech corpus consisting of
|
||||
data sourced from 2009-2020 European Parliament event recordings. It contains labelled audio-transcription data for 15
|
||||
European languages. In this guide, we are using the Dutch language subset, feel free to pick another subset.
|
||||
|
||||
Note that VoxPopuli or any other automated speech recognition (ASR) dataset may not be the most suitable
|
||||
option for training TTS models. The features that make it beneficial for ASR, such as excessive background noise, are
|
||||
typically undesirable in TTS. However, finding top-quality, multilingual, and multi-speaker TTS datasets can be quite
|
||||
challenging.
|
||||
|
||||
Let's load the data:
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset, Audio
|
||||
|
||||
>>> dataset = load_dataset("facebook/voxpopuli", "nl", split="train")
|
||||
>>> len(dataset)
|
||||
20968
|
||||
```
|
||||
|
||||
20968 examples should be sufficient for fine-tuning. SpeechT5 expects audio data to have a sampling rate of 16 kHz, so
|
||||
make sure the examples in the dataset meet this requirement:
|
||||
|
||||
```py
|
||||
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
|
||||
```
|
||||
|
||||
## Preprocess the data
|
||||
|
||||
Let's begin by defining the model checkpoint to use and loading the appropriate processor:
|
||||
|
||||
```py
|
||||
>>> from transformers import SpeechT5Processor
|
||||
|
||||
>>> checkpoint = "microsoft/speecht5_tts"
|
||||
>>> processor = SpeechT5Processor.from_pretrained(checkpoint)
|
||||
```
|
||||
|
||||
### Text cleanup for SpeechT5 tokenization
|
||||
|
||||
Start by cleaning up the text data. You'll need the tokenizer part of the processor to process the text:
|
||||
|
||||
```py
|
||||
>>> tokenizer = processor.tokenizer
|
||||
```
|
||||
|
||||
The dataset examples contain `raw_text` and `normalized_text` features. When deciding which feature to use as the text input,
|
||||
consider that the SpeechT5 tokenizer doesn't have any tokens for numbers. In `normalized_text` the numbers are written
|
||||
out as text. Thus, it is a better fit, and we recommend using `normalized_text` as input text.
|
||||
|
||||
Because SpeechT5 was trained on the English language, it may not recognize certain characters in the Dutch dataset. If
|
||||
left as is, these characters will be converted to `<unk>` tokens. However, in Dutch, certain characters like `à` are
|
||||
used to stress syllables. In order to preserve the meaning of the text, we can replace this character with a regular `a`.
|
||||
|
||||
To identify unsupported tokens, extract all unique characters in the dataset using the `SpeechT5Tokenizer` which
|
||||
works with characters as tokens. To do this, write the `extract_all_chars` mapping function that concatenates
|
||||
the transcriptions from all examples into one string and converts it to a set of characters.
|
||||
Make sure to set `batched=True` and `batch_size=-1` in `dataset.map()` so that all transcriptions are available at once for
|
||||
the mapping function.
|
||||
|
||||
```py
|
||||
>>> def extract_all_chars(batch):
|
||||
... all_text = " ".join(batch["normalized_text"])
|
||||
... vocab = list(set(all_text))
|
||||
... return {"vocab": [vocab], "all_text": [all_text]}
|
||||
|
||||
|
||||
>>> vocabs = dataset.map(
|
||||
... extract_all_chars,
|
||||
... batched=True,
|
||||
... batch_size=-1,
|
||||
... keep_in_memory=True,
|
||||
... remove_columns=dataset.column_names,
|
||||
... )
|
||||
|
||||
>>> dataset_vocab = set(vocabs["vocab"][0])
|
||||
>>> tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()}
|
||||
```
|
||||
|
||||
Now you have two sets of characters: one with the vocabulary from the dataset and one with the vocabulary from the tokenizer.
|
||||
To identify any unsupported characters in the dataset, you can take the difference between these two sets. The resulting
|
||||
set will contain the characters that are in the dataset but not in the tokenizer.
|
||||
|
||||
```py
|
||||
>>> dataset_vocab - tokenizer_vocab
|
||||
{' ', 'à', 'ç', 'è', 'ë', 'í', 'ï', 'ö', 'ü'}
|
||||
```
|
||||
|
||||
To handle the unsupported characters identified in the previous step, define a function that maps these characters to
|
||||
valid tokens. Note that spaces are already replaced by `▁` in the tokenizer and don't need to be handled separately.
|
||||
|
||||
```py
|
||||
>>> replacements = [
|
||||
... ("à", "a"),
|
||||
... ("ç", "c"),
|
||||
... ("è", "e"),
|
||||
... ("ë", "e"),
|
||||
... ("í", "i"),
|
||||
... ("ï", "i"),
|
||||
... ("ö", "o"),
|
||||
... ("ü", "u"),
|
||||
... ]
|
||||
|
||||
|
||||
>>> def cleanup_text(inputs):
|
||||
... for src, dst in replacements:
|
||||
... inputs["normalized_text"] = inputs["normalized_text"].replace(src, dst)
|
||||
... return inputs
|
||||
|
||||
|
||||
>>> dataset = dataset.map(cleanup_text)
|
||||
```
|
||||
|
||||
Now that you have dealt with special characters in the text, it's time to shift focus to the audio data.
|
||||
|
||||
### Speakers
|
||||
|
||||
The VoxPopuli dataset includes speech from multiple speakers, but how many speakers are represented in the dataset? To
|
||||
determine this, we can count the number of unique speakers and the number of examples each speaker contributes to the dataset.
|
||||
With a total of 20,968 examples in the dataset, this information will give us a better understanding of the distribution of
|
||||
speakers and examples in the data.
|
||||
|
||||
```py
|
||||
>>> from collections import defaultdict
|
||||
|
||||
>>> speaker_counts = defaultdict(int)
|
||||
|
||||
>>> for speaker_id in dataset["speaker_id"]:
|
||||
... speaker_counts[speaker_id] += 1
|
||||
```
|
||||
|
||||
By plotting a histogram you can get a sense of how much data there is for each speaker.
|
||||
|
||||
```py
|
||||
>>> import matplotlib.pyplot as plt
|
||||
|
||||
>>> plt.figure()
|
||||
>>> plt.hist(speaker_counts.values(), bins=20)
|
||||
>>> plt.ylabel("Speakers")
|
||||
>>> plt.xlabel("Examples")
|
||||
>>> plt.show()
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/tts_speakers_histogram.png" alt="Speakers histogram"/>
|
||||
</div>
|
||||
|
||||
The histogram reveals that approximately one-third of the speakers in the dataset have fewer than 100 examples, while
|
||||
around ten speakers have more than 500 examples. To improve training efficiency and balance the dataset, we can limit
|
||||
the data to speakers with between 100 and 400 examples.
|
||||
|
||||
```py
|
||||
>>> def select_speaker(speaker_id):
|
||||
... return 100 <= speaker_counts[speaker_id] <= 400
|
||||
|
||||
|
||||
>>> dataset = dataset.filter(select_speaker, input_columns=["speaker_id"])
|
||||
```
|
||||
|
||||
Let's check how many speakers remain:
|
||||
|
||||
```py
|
||||
>>> len(set(dataset["speaker_id"]))
|
||||
42
|
||||
```
|
||||
|
||||
Let's see how many examples are left:
|
||||
|
||||
```py
|
||||
>>> len(dataset)
|
||||
9973
|
||||
```
|
||||
|
||||
You are left with just under 10,000 examples from approximately 40 unique speakers, which should be sufficient.
|
||||
|
||||
Note that some speakers with few examples may actually have more audio available if the examples are long. However,
|
||||
determining the total amount of audio for each speaker requires scanning through the entire dataset, which is a
|
||||
time-consuming process that involves loading and decoding each audio file. As such, we have chosen to skip this step here.
|
||||
|
||||
### Speaker embeddings
|
||||
|
||||
To enable the TTS model to differentiate between multiple speakers, you'll need to create a speaker embedding for each example.
|
||||
The speaker embedding is an additional input into the model that captures a particular speaker's voice characteristics.
|
||||
To generate these speaker embeddings, use the pre-trained [spkrec-xvect-voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb)
|
||||
model from SpeechBrain.
|
||||
|
||||
Create a function `create_speaker_embedding()` that takes an input audio waveform and outputs a 512-element vector
|
||||
containing the corresponding speaker embedding.
|
||||
|
||||
```py
|
||||
>>> import os
|
||||
>>> import torch
|
||||
>>> from speechbrain.pretrained import EncoderClassifier
|
||||
|
||||
>>> spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
|
||||
|
||||
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
>>> speaker_model = EncoderClassifier.from_hparams(
|
||||
... source=spk_model_name,
|
||||
... run_opts={"device": device},
|
||||
... savedir=os.path.join("/tmp", spk_model_name),
|
||||
... )
|
||||
|
||||
|
||||
>>> def create_speaker_embedding(waveform):
|
||||
... with torch.no_grad():
|
||||
... speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
|
||||
... speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
|
||||
... speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
|
||||
... return speaker_embeddings
|
||||
```
|
||||
|
||||
It's important to note that the `speechbrain/spkrec-xvect-voxceleb` model was trained on English speech from the VoxCeleb
|
||||
dataset, whereas the training examples in this guide are in Dutch. While we believe that this model will still generate
|
||||
reasonable speaker embeddings for our Dutch dataset, this assumption may not hold true in all cases.
|
||||
|
||||
For optimal results, we recommend training an X-vector model on the target speech first. This will ensure that the model
|
||||
is better able to capture the unique voice characteristics present in the Dutch language.
|
||||
|
||||
### Processing the dataset
|
||||
|
||||
Finally, let's process the data into the format the model expects. Create a `prepare_dataset` function that takes in a
|
||||
single example and uses the `SpeechT5Processor` object to tokenize the input text and load the target audio into a log-mel spectrogram.
|
||||
It should also add the speaker embeddings as an additional input.
|
||||
|
||||
```py
|
||||
>>> def prepare_dataset(example):
|
||||
... audio = example["audio"]
|
||||
|
||||
... example = processor(
|
||||
... text=example["normalized_text"],
|
||||
... audio_target=audio["array"],
|
||||
... sampling_rate=audio["sampling_rate"],
|
||||
... return_attention_mask=False,
|
||||
... )
|
||||
|
||||
... # strip off the batch dimension
|
||||
... example["labels"] = example["labels"][0]
|
||||
|
||||
... # use SpeechBrain to obtain x-vector
|
||||
... example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
|
||||
|
||||
... return example
|
||||
```
|
||||
|
||||
Verify the processing is correct by looking at a single example:
|
||||
|
||||
```py
|
||||
>>> processed_example = prepare_dataset(dataset[0])
|
||||
>>> list(processed_example.keys())
|
||||
['input_ids', 'labels', 'stop_labels', 'speaker_embeddings']
|
||||
```
|
||||
|
||||
Speaker embeddings should be a 512-element vector:
|
||||
|
||||
```py
|
||||
>>> processed_example["speaker_embeddings"].shape
|
||||
(512,)
|
||||
```
|
||||
|
||||
The labels should be a log-mel spectrogram with 80 mel bins.
|
||||
|
||||
```py
|
||||
>>> import matplotlib.pyplot as plt
|
||||
|
||||
>>> plt.figure()
|
||||
>>> plt.imshow(processed_example["labels"].T)
|
||||
>>> plt.show()
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/tts_logmelspectrogram_1.png" alt="Log-mel spectrogram with 80 mel bins"/>
|
||||
</div>
|
||||
|
||||
Side note: If you find this spectrogram confusing, it may be due to your familiarity with the convention of placing low frequencies
|
||||
at the bottom and high frequencies at the top of a plot. However, when plotting spectrograms as an image using the matplotlib library,
|
||||
the y-axis is flipped and the spectrograms appear upside down.
|
||||
|
||||
Now apply the processing function to the entire dataset. This will take between 5 and 10 minutes.
|
||||
|
||||
```py
|
||||
>>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)
|
||||
```
|
||||
|
||||
You'll see a warning saying that some examples in the dataset are longer than the maximum input length the model can handle (600 tokens).
|
||||
Remove those examples from the dataset. Here we go even further and to allow for larger batch sizes we remove anything over 200 tokens.
|
||||
|
||||
```py
|
||||
>>> def is_not_too_long(input_ids):
|
||||
... input_length = len(input_ids)
|
||||
... return input_length < 200
|
||||
|
||||
|
||||
>>> dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
|
||||
>>> len(dataset)
|
||||
8259
|
||||
```
|
||||
|
||||
Next, create a basic train/test split:
|
||||
|
||||
```py
|
||||
>>> dataset = dataset.train_test_split(test_size=0.1)
|
||||
```
|
||||
|
||||
### Data collator
|
||||
|
||||
In order to combine multiple examples into a batch, you need to define a custom data collator. This collator will pad shorter sequences with padding
|
||||
tokens, ensuring that all examples have the same length. For the spectrogram labels, the padded portions are replaced with the special value `-100`. This special value
|
||||
instructs the model to ignore that part of the spectrogram when calculating the spectrogram loss.
|
||||
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from typing import Any, Dict, List, Union
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class TTSDataCollatorWithPadding:
|
||||
... processor: Any
|
||||
|
||||
... def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
|
||||
... input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
|
||||
... label_features = [{"input_values": feature["labels"]} for feature in features]
|
||||
... speaker_features = [feature["speaker_embeddings"] for feature in features]
|
||||
|
||||
... # collate the inputs and targets into a batch
|
||||
... batch = processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt")
|
||||
|
||||
... # replace padding with -100 to ignore loss correctly
|
||||
... batch["labels"] = batch["labels"].masked_fill(batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100)
|
||||
|
||||
... # not used during fine-tuning
|
||||
... del batch["decoder_attention_mask"]
|
||||
|
||||
... # round down target lengths to multiple of reduction factor
|
||||
... if model.config.reduction_factor > 1:
|
||||
... target_lengths = torch.tensor([len(feature["input_values"]) for feature in label_features])
|
||||
... target_lengths = target_lengths.new(
|
||||
... [length - length % model.config.reduction_factor for length in target_lengths]
|
||||
... )
|
||||
... max_length = max(target_lengths)
|
||||
... batch["labels"] = batch["labels"][:, :max_length]
|
||||
|
||||
... # also add in the speaker embeddings
|
||||
... batch["speaker_embeddings"] = torch.tensor(speaker_features)
|
||||
|
||||
... return batch
|
||||
```
|
||||
|
||||
In SpeechT5, the input to the decoder part of the model is reduced by a factor 2. In other words, it throws away every
|
||||
other timestep from the target sequence. The decoder then predicts a sequence that is twice as long. Since the original
|
||||
target sequence length may be odd, the data collator makes sure to round the maximum length of the batch down to be a
|
||||
multiple of 2.
|
||||
|
||||
```py
|
||||
>>> data_collator = TTSDataCollatorWithPadding(processor=processor)
|
||||
```
|
||||
|
||||
## Train the model
|
||||
|
||||
Load the pre-trained model from the same checkpoint as you used for loading the processor:
|
||||
|
||||
```py
|
||||
>>> from transformers import SpeechT5ForTextToSpeech
|
||||
|
||||
>>> model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
|
||||
```
|
||||
|
||||
The `use_cache=True` option is incompatible with gradient checkpointing. Disable it for training.
|
||||
|
||||
```py
|
||||
>>> model.config.use_cache = False
|
||||
```
|
||||
|
||||
Define the training arguments. Here we are not computing any evaluation metrics during the training process. Instead, we'll
|
||||
only look at the loss:
|
||||
|
||||
```python
|
||||
>>> from transformers import Seq2SeqTrainingArguments
|
||||
|
||||
>>> training_args = Seq2SeqTrainingArguments(
|
||||
... output_dir="speecht5_finetuned_voxpopuli_nl", # change to a repo name of your choice
|
||||
... per_device_train_batch_size=4,
|
||||
... gradient_accumulation_steps=8,
|
||||
... learning_rate=1e-5,
|
||||
... warmup_steps=500,
|
||||
... max_steps=4000,
|
||||
... gradient_checkpointing=True,
|
||||
... fp16=True,
|
||||
... evaluation_strategy="steps",
|
||||
... per_device_eval_batch_size=2,
|
||||
... save_steps=1000,
|
||||
... eval_steps=1000,
|
||||
... logging_steps=25,
|
||||
... report_to=["tensorboard"],
|
||||
... load_best_model_at_end=True,
|
||||
... greater_is_better=False,
|
||||
... label_names=["labels"],
|
||||
... push_to_hub=True,
|
||||
... )
|
||||
```
|
||||
|
||||
Instantiate the `Trainer` object and pass the model, dataset, and data collator to it.
|
||||
|
||||
```py
|
||||
>>> from transformers import Seq2SeqTrainer
|
||||
|
||||
>>> trainer = Seq2SeqTrainer(
|
||||
... args=training_args,
|
||||
... model=model,
|
||||
... train_dataset=dataset["train"],
|
||||
... eval_dataset=dataset["test"],
|
||||
... data_collator=data_collator,
|
||||
... tokenizer=processor,
|
||||
... )
|
||||
```
|
||||
|
||||
And with that, you're ready to start training! Training will take several hours. Depending on your GPU,
|
||||
it is possible that you will encounter a CUDA "out-of-memory" error when you start training. In this case, you can reduce
|
||||
the `per_device_train_batch_size` incrementally by factors of 2 and increase `gradient_accumulation_steps` by 2x to compensate.
|
||||
|
||||
```py
|
||||
>>> trainer.train()
|
||||
```
|
||||
|
||||
Push the final model to the 🤗 Hub:
|
||||
|
||||
```py
|
||||
>>> trainer.push_to_hub()
|
||||
```
|
||||
|
||||
## Inference
|
||||
|
||||
Great, now that you've fine-tuned a model, you can use it for inference!
|
||||
Load the model from the 🤗 Hub (make sure to use your account name in the following code snippet):
|
||||
|
||||
```py
|
||||
>>> model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_ACCOUNT/speecht5_finetuned_voxpopuli_nl")
|
||||
```
|
||||
|
||||
Pick an example, here we'll take one from the test dataset. Obtain a speaker embedding.
|
||||
|
||||
```py
|
||||
>>> example = dataset["test"][304]
|
||||
>>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
|
||||
```
|
||||
|
||||
Define some input text and tokenize it.
|
||||
|
||||
```py
|
||||
>>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!"
|
||||
```
|
||||
|
||||
Preprocess the input text:
|
||||
|
||||
```py
|
||||
>>> inputs = processor(text=text, return_tensors="pt")
|
||||
```
|
||||
|
||||
Create a spectrogram with your model:
|
||||
|
||||
```py
|
||||
>>> spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
|
||||
```
|
||||
|
||||
Visualize the spectrogram, if you'd like to:
|
||||
|
||||
```py
|
||||
>>> plt.figure()
|
||||
>>> plt.imshow(spectrogram.T)
|
||||
>>> plt.show()
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/tts_logmelspectrogram_2.png" alt="Generated log-mel spectrogram"/>
|
||||
</div>
|
||||
|
||||
Finally, use the vocoder to turn the spectrogram into sound.
|
||||
|
||||
```py
|
||||
>>> with torch.no_grad():
|
||||
... speech = vocoder(spectrogram)
|
||||
|
||||
>>> from IPython.display import Audio
|
||||
|
||||
>>> Audio(speech.numpy(), rate=16000)
|
||||
```
|
||||
|
||||
In our experience, obtaining satisfactory results from this model can be challenging. The quality of the speaker
|
||||
embeddings appears to be a significant factor. Since SpeechT5 was pre-trained with English x-vectors, it performs best
|
||||
when using English speaker embeddings. If the synthesized speech sounds poor, try using a different speaker embedding.
|
||||
|
||||
Increasing the training duration is also likely to enhance the quality of the results. Even so, the speech clearly is Dutch instead of English, and it does
|
||||
capture the voice characteristics of the speaker (compare to the original audio in the example).
|
||||
Another thing to experiment with is the model's configuration. For example, try using `config.reduction_factor = 1` to
|
||||
see if this improves the results.
|
||||
|
||||
Finally, it is essential to consider ethical considerations. Although TTS technology has numerous useful applications, it
|
||||
may also be used for malicious purposes, such as impersonating someone's voice without their knowledge or consent. Please
|
||||
use TTS judiciously and responsibly.
|
@ -212,12 +212,20 @@ Example:
|
||||
```"""
|
||||
|
||||
```
|
||||
3 steps are required to debug the docstring examples:
|
||||
1. In order to properly run the test, **an extra line has to be added** at the end of the docstring. This can be automatically done on any file using:
|
||||
```bash
|
||||
python utils/prepare_for_doc_test.py <path_to_file_or_dir>
|
||||
```
|
||||
|
||||
Just run the following line to automatically test every docstring example in the desired file:
|
||||
2. Then, you can use the following line to automatically test every docstring example in the desired file:
|
||||
```bash
|
||||
pytest --doctest-modules <path_to_file_or_dir>
|
||||
```
|
||||
If the file has a markdown extention, you should add the `--doctest-glob="*.mdx"` argument.
|
||||
3. Once you are done debugging, you need to remove the extra line added in step **1.** by running the following:
|
||||
```bash
|
||||
python utils/prepare_for_doc_test.py <path_to_file_or_dir> --remove_new_line
|
||||
```
|
||||
|
||||
### Run only modified tests
|
||||
|
||||
|
@ -1,132 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Running tools on inference endpoints
|
||||
|
||||
<Tip>
|
||||
|
||||
This document is about running tools on inference endpoints so that agents may use these tools remotely.
|
||||
If you do not know what tools and agents are in the context of transformers, we recommend you read the
|
||||
[Transformers Agents](transformers_agents) page first.
|
||||
|
||||
</Tip>
|
||||
|
||||
Agents are designed to use tools in order to respond to a natural language query. They are setup so as to load tools
|
||||
locally and use them directly in the runtime they're at.
|
||||
|
||||
However, some of these tools can be heavy; tools that handle images, long text, or audio signals may need a
|
||||
significant amount of memory in order to perform inference. Tools that generate images through a diffusion
|
||||
process, may require significant compute in order to perform the multiple steps they need; but end users
|
||||
may not benefit from the powerful setups required to use them.
|
||||
|
||||
This is why we have support for **remote** tools: these have an API that can be called from the runtime, offloading
|
||||
the processing to the remote API. In this guide we'll explore how to set up an inference endpoint for a given tool
|
||||
to leverage it with the agents.
|
||||
|
||||
Inference endpoints are one solution to handle remote tools; but they're not the only one. We integrate with
|
||||
[`gradio_tools`](custom_tools#leveraging-gradiotools) that also offers remote tools, and we'll continue adding
|
||||
guides to other alternatives for remote tools.
|
||||
|
||||
## Inference Endpoints
|
||||
|
||||
|
||||
[Inference Endpoints](https://huggingface.co/inference-endpoints) is a paid Hugging Face solution to easily deploy
|
||||
Transformers and Diffusers models on a fully-managed infrastructure. It has default deployment options for
|
||||
transformers and diffusers, but given that we're using a specific type of object here, tools, we'll set up a custom
|
||||
handler to get it to work.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Inference Endpoints are a paid hosting service by Hugging Face, which needs to have an organization setup with
|
||||
billing enabled.
|
||||
|
||||
</Tip>
|
||||
|
||||
Tools are Spaces by default in Transformers. When calling `push_to_hub` on a tool, you're effectively pushing
|
||||
the code to a Space on the Hugging Face Hub under a namespace that you own. There are many tools living on the
|
||||
[`huggingface-tools` namespace](https://huggingface.co/huggingface-tools); having them be Spaces by default means
|
||||
that users can play around with the tool directly in the browser.
|
||||
|
||||
However, Inference Endpoints only work with **model** repositories. We'll therefore have to create a model
|
||||
repository to act as a proxy for the Space. That model repository will contain the `handler.py` file to serve
|
||||
our tool through an inference endpoint.
|
||||
|
||||
For demonstration purposes, we'll consider that you already have a tool handy that you'd like to use remotely. If
|
||||
you'd like to setup your custom tool, we recommend reading the [Custom Tool](custom_tools#leveraging-gradiotools)
|
||||
guide.
|
||||
|
||||
We'll try and deploy the `huggingface-tools/text-to-video` tool to an inference endpoint. We have it available as
|
||||
a gradio Space [here](https://huggingface.co/huggingface-tools/text-to-video).
|
||||
|
||||
### Setting up the repository
|
||||
|
||||
We'll start by creating a model repository that will serve as a serving point for this tool.
|
||||
It can be public or private; for the sake of this tutorial we'll keep this one public, but having it set to
|
||||
private doesn't interfere with the inference endpoint setup.
|
||||
|
||||
The repository is created and is available [here](https://huggingface.co/huggingface-tools/text-to-video).
|
||||
In it, you'll see there is a custom handler file, called
|
||||
[`handler.py`](https://huggingface.co/huggingface-tools/text-to-video/blob/main/handler.py), as well as a traditional
|
||||
requirements file called
|
||||
[`requirements.txt`](https://huggingface.co/huggingface-tools/text-to-video/blob/main/requirements.txt).
|
||||
|
||||
#### Handler file
|
||||
|
||||
The handler file exposes an `EndpointHandler`, which serves as the link between the requests you'll be doing to the
|
||||
remote tool and the tool itself. It should:
|
||||
|
||||
- Instantiate the tool in its initialization method
|
||||
- Have a `__call__` method which will take the serialized input and return the computed result.
|
||||
|
||||
For text-to-text tools, the handler file is very simple; it looks like the following:
|
||||
|
||||
```python
|
||||
from transformers.tools import load_tool
|
||||
|
||||
|
||||
class EndpointHandler:
|
||||
def __init__(self, path=""):
|
||||
self.tool = load_tool("huggingface-tools/text-to-video")
|
||||
self.tool.setup()
|
||||
|
||||
def __call__(self, data):
|
||||
inputs = data.pop("inputs", data)
|
||||
return self.tool(**inputs)
|
||||
```
|
||||
|
||||
However, it is different if handling different data types as it will need to serialize this data type.
|
||||
This guide will be completed to include different serialization for text, image, audio and video.
|
||||
|
||||
#### Requirement file
|
||||
|
||||
The requirement file needs to specify all requirements necessary to run the tool. The basic dependencies are the
|
||||
following:
|
||||
|
||||
```text
|
||||
transformers>=4.29.0
|
||||
accelerate
|
||||
```
|
||||
|
||||
but you may need to include any and all other dependencies needed by your tool
|
||||
|
||||
### Spinning up an endpoint
|
||||
|
||||
Once we're done creating the repository, we can go ahead and create our first endpoint. Head over to
|
||||
[the Inference Endpoints UI](https://ui.endpoints.huggingface.co/endpoints) and create your first endpoint.
|
||||
|
||||
If the repository is setup correctly, it should spin up directly without issue.
|
||||
|
||||
In case you encounter a "Failed" deployment, we recommend checking out
|
||||
[this guide](https://huggingface.co/docs/inference-endpoints/guides/logs) on checking out the logs of an inference
|
||||
endpoint.
|
||||
|
||||
TODO add images
|
@ -201,7 +201,7 @@ AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launc
|
||||
### Converting a model for AWS Neuron
|
||||
|
||||
Convert a model for AWS NEURON using the same code from [Using TorchScript in
|
||||
Python](torchscript#using-torchscript-in-python) to trace a `BertModel`. Import the
|
||||
Python](serialization#using-torchscript-in-python) to trace a `BertModel`. Import the
|
||||
`torch.neuron` framework extension to access the components of the Neuron SDK through a
|
||||
Python API:
|
||||
|
||||
|
@ -1,331 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Transformers Agent
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Transformers Agent is an experimental API which is subject to change at any time. Results returned by the agents
|
||||
can vary as the APIs or underlying models are prone to change.
|
||||
|
||||
</Tip>
|
||||
|
||||
Transformers version v4.29.0, building on the concept of *tools* and *agents*. You can play with in
|
||||
[this colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj).
|
||||
|
||||
In short, it provides a natural language API on top of transformers: we define a set of curated tools and design an
|
||||
agent to interpret natural language and to use these tools. It is extensible by design; we curated some relevant tools,
|
||||
but we'll show you how the system can be extended easily to use any tool developed by the community.
|
||||
|
||||
Let's start with a few examples of what can be achieved with this new API. It is particularly powerful when it comes
|
||||
to multimodal tasks, so let's take it for a spin to generate images and read text out loud.
|
||||
|
||||
```py
|
||||
agent.run("Caption the following image", image=image)
|
||||
```
|
||||
|
||||
| **Input** | **Output** |
|
||||
|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------|
|
||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/beaver.png" width=200> | A beaver is swimming in the water |
|
||||
|
||||
---
|
||||
|
||||
```py
|
||||
agent.run("Read the following text out loud", text=text)
|
||||
```
|
||||
| **Input** | **Output** |
|
||||
|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------|
|
||||
| A beaver is swimming in the water | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tts_example.wav" type="audio/wav"> your browser does not support the audio element. </audio>
|
||||
|
||||
---
|
||||
|
||||
```py
|
||||
agent.run(
|
||||
"In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?",
|
||||
document=document,
|
||||
)
|
||||
```
|
||||
| **Input** | **Output** |
|
||||
|-----------------------------------------------------------------------------------------------------------------------------|----------------|
|
||||
| <img src="https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/0/image/image.jpg" width=200> | ballroom foyer |
|
||||
|
||||
## Quickstart
|
||||
|
||||
Before being able to use `agent.run`, you will need to instantiate an agent, which is a large language model (LLM).
|
||||
We provide support for openAI models as well as opensource alternatives from BigCode and OpenAssistant. The openAI
|
||||
models perform better (but require you to have an openAI API key, so cannot be used for free); Hugging Face is
|
||||
providing free access to endpoints for BigCode and OpenAssistant models.
|
||||
|
||||
To start with, please install the `agents` extras in order to install all default dependencies.
|
||||
```bash
|
||||
pip install transformers[agents]
|
||||
```
|
||||
|
||||
To use openAI models, you instantiate an [`OpenAiAgent`] after installing the `openai` dependency:
|
||||
|
||||
```bash
|
||||
pip install openai
|
||||
```
|
||||
|
||||
|
||||
```py
|
||||
from transformers import OpenAiAgent
|
||||
|
||||
agent = OpenAiAgent(model="text-davinci-003", api_key="<your_api_key>")
|
||||
```
|
||||
|
||||
To use BigCode or OpenAssistant, start by logging in to have access to the Inference API:
|
||||
|
||||
```py
|
||||
from huggingface_hub import login
|
||||
|
||||
login("<YOUR_TOKEN>")
|
||||
```
|
||||
|
||||
Then, instantiate the agent
|
||||
|
||||
```py
|
||||
from transformers import HfAgent
|
||||
|
||||
# Starcoder
|
||||
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
|
||||
# StarcoderBase
|
||||
# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase")
|
||||
# OpenAssistant
|
||||
# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5")
|
||||
```
|
||||
|
||||
This is using the inference API that Hugging Face provides for free at the moment. If you have your own inference
|
||||
endpoint for this model (or another one) you can replace the URL above with your URL endpoint.
|
||||
|
||||
<Tip>
|
||||
|
||||
StarCoder and OpenAssistant are free to use and perform admirably well on simple tasks. However, the checkpoints
|
||||
don't hold up when handling more complex prompts. If you're facing such an issue, we recommend trying out the OpenAI
|
||||
model which, while sadly not open-source, performs better at this given time.
|
||||
|
||||
</Tip>
|
||||
|
||||
You're now good to go! Let's dive into the two APIs that you now have at your disposal.
|
||||
|
||||
### Single execution (run)
|
||||
|
||||
The single execution method is when using the [`~Agent.run`] method of the agent:
|
||||
|
||||
```py
|
||||
agent.run("Draw me a picture of rivers and lakes.")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
|
||||
|
||||
It automatically selects the tool (or tools) appropriate for the task you want to perform and runs them appropriately. It
|
||||
can perform one or several tasks in the same instruction (though the more complex your instruction, the more likely
|
||||
the agent is to fail).
|
||||
|
||||
```py
|
||||
agent.run("Draw me a picture of the sea then transform the picture to add an island")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sea_and_island.png" width=200>
|
||||
|
||||
<br/>
|
||||
|
||||
|
||||
Every [`~Agent.run`] operation is independent, so you can run it several times in a row with different tasks.
|
||||
|
||||
Note that your `agent` is just a large-language model, so small variations in your prompt might yield completely
|
||||
different results. It's important to explain as clearly as possible the task you want to perform. We go more in-depth
|
||||
on how to write good prompts [here](custom_tools#writing-good-user-inputs).
|
||||
|
||||
If you'd like to keep a state across executions or to pass non-text objects to the agent, you can do so by specifying
|
||||
variables that you would like the agent to use. For example, you could generate the first image of rivers and lakes,
|
||||
and ask the model to update that picture to add an island by doing the following:
|
||||
|
||||
```python
|
||||
picture = agent.run("Generate a picture of rivers and lakes.")
|
||||
updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture)
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
This can be helpful when the model is unable to understand your request and mixes tools. An example would be:
|
||||
|
||||
```py
|
||||
agent.run("Draw me the picture of a capybara swimming in the sea")
|
||||
```
|
||||
|
||||
Here, the model could interpret in two ways:
|
||||
- Have the `text-to-image` generate a capybara swimming in the sea
|
||||
- Or, have the `text-to-image` generate capybara, then use the `image-transformation` tool to have it swim in the sea
|
||||
|
||||
In case you would like to force the first scenario, you could do so by passing it the prompt as an argument:
|
||||
|
||||
```py
|
||||
agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea")
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
### Chat-based execution (chat)
|
||||
|
||||
The agent also has a chat-based approach, using the [`~Agent.chat`] method:
|
||||
|
||||
```py
|
||||
agent.chat("Generate a picture of rivers and lakes")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
|
||||
|
||||
```py
|
||||
agent.chat("Transform the picture so that there is a rock in there")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_and_beaver.png" width=200>
|
||||
|
||||
<br/>
|
||||
|
||||
This is an interesting approach when you want to keep the state across instructions. It's better for experimentation,
|
||||
but will tend to be much better at single instructions rather than complex instructions (which the [`~Agent.run`]
|
||||
method is better at handling).
|
||||
|
||||
This method can also take arguments if you would like to pass non-text types or specific prompts.
|
||||
|
||||
### ⚠️ Remote execution
|
||||
|
||||
For demonstration purposes and so that this can be used with all setups, we have created remote executors for several
|
||||
of the default tools the agent has access. These are created using
|
||||
[inference endpoints](https://huggingface.co/inference-endpoints). To see how to set up remote executors tools yourself,
|
||||
we recommend reading the [custom tool guide](./custom_tools).
|
||||
|
||||
In order to run with remote tools, specifying `remote=True` to either [`~Agent.run`] or [`~Agent.chat`] is sufficient.
|
||||
|
||||
For example, the following command could be run on any device efficiently, without needing significant RAM or GPU:
|
||||
|
||||
```py
|
||||
agent.run("Draw me a picture of rivers and lakes", remote=True)
|
||||
```
|
||||
|
||||
The same can be said for [`~Agent.chat`]:
|
||||
|
||||
```py
|
||||
agent.chat("Draw me a picture of rivers and lakes", remote=True)
|
||||
```
|
||||
|
||||
### What's happening here? What are tools, and what are agents?
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/diagram.png">
|
||||
|
||||
#### Agents
|
||||
|
||||
The "agent" here is a large language model, and we're prompting it so that it has access to a specific set of tools.
|
||||
|
||||
LLMs are pretty good at generating small samples of code, so this API takes advantage of that by prompting the
|
||||
LLM gives a small sample of code performing a task with a set of tools. This prompt is then completed by the
|
||||
task you give your agent and the description of the tools you give it. This way it gets access to the doc of the
|
||||
tools you are using, especially their expected inputs and outputs, and can generate the relevant code.
|
||||
|
||||
#### Tools
|
||||
|
||||
Tools are very simple: they're a single function, with a name, and a description. We then use these tools' descriptions
|
||||
to prompt the agent. Through the prompt, we show the agent how it would leverage tools to perform what was
|
||||
requested in the query.
|
||||
|
||||
This is using brand-new tools and not pipelines, because the agent writes better code with very atomic tools.
|
||||
Pipelines are more refactored and often combine several tasks in one. Tools are meant to be focused on
|
||||
one very simple task only.
|
||||
|
||||
#### Code-execution?!
|
||||
|
||||
This code is then executed with our small Python interpreter on the set of inputs passed along with your tools.
|
||||
We hear you screaming "Arbitrary code execution!" in the back, but let us explain why that is not the case.
|
||||
|
||||
The only functions that can be called are the tools you provided and the print function, so you're already
|
||||
limited in what can be executed. You should be safe if it's limited to Hugging Face tools.
|
||||
|
||||
Then, we don't allow any attribute lookup or imports (which shouldn't be needed anyway for passing along
|
||||
inputs/outputs to a small set of functions) so all the most obvious attacks (and you'd need to prompt the LLM
|
||||
to output them anyway) shouldn't be an issue. If you want to be on the super safe side, you can execute the
|
||||
run() method with the additional argument return_code=True, in which case the agent will just return the code
|
||||
to execute and you can decide whether to do it or not.
|
||||
|
||||
The execution will stop at any line trying to perform an illegal operation or if there is a regular Python error
|
||||
with the code generated by the agent.
|
||||
|
||||
### A curated set of tools
|
||||
|
||||
We identify a set of tools that can empower such agents. Here is an updated list of the tools we have integrated
|
||||
in `transformers`:
|
||||
|
||||
- **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut))
|
||||
- **Text question answering**: given a long text and a question, answer the question in the text ([Flan-T5](./model_doc/flan-t5))
|
||||
- **Unconditional image captioning**: Caption the image! ([BLIP](./model_doc/blip))
|
||||
- **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt))
|
||||
- **Image segmentation**: given an image and a prompt, output the segmentation mask of that prompt ([CLIPSeg](./model_doc/clipseg))
|
||||
- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper))
|
||||
- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5))
|
||||
- **Zero-shot text classification**: given a text and a list of labels, identify to which label the text corresponds the most ([BART](./model_doc/bart))
|
||||
- **Text summarization**: summarize a long text in one or a few sentences ([BART](./model_doc/bart))
|
||||
- **Translation**: translate the text into a given language ([NLLB](./model_doc/nllb))
|
||||
|
||||
These tools have an integration in transformers, and can be used manually as well, for example:
|
||||
|
||||
```py
|
||||
from transformers import load_tool
|
||||
|
||||
tool = load_tool("text-to-speech")
|
||||
audio = tool("This is a text to speech tool")
|
||||
```
|
||||
|
||||
### Custom tools
|
||||
|
||||
While we identify a curated set of tools, we strongly believe that the main value provided by this implementation is
|
||||
the ability to quickly create and share custom tools.
|
||||
|
||||
By pushing the code of a tool to a Hugging Face Space or a model repository, you're then able to leverage the tool
|
||||
directly with the agent. We've added a few
|
||||
**transformers-agnostic** tools to the [`huggingface-tools` organization](https://huggingface.co/huggingface-tools):
|
||||
|
||||
- **Text downloader**: to download a text from a web URL
|
||||
- **Text to image**: generate an image according to a prompt, leveraging stable diffusion
|
||||
- **Image transformation**: modify an image given an initial image and a prompt, leveraging instruct pix2pix stable diffusion
|
||||
- **Text to video**: generate a small video according to a prompt, leveraging damo-vilab
|
||||
|
||||
The text-to-image tool we have been using since the beginning is a remote tool that lives in
|
||||
[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! We will
|
||||
continue releasing such tools on this and other organizations, to further supercharge this implementation.
|
||||
|
||||
The agents have by default access to tools that reside on [`huggingface-tools`](https://huggingface.co/huggingface-tools).
|
||||
We explain how to you can write and share your tools as well as leverage any custom tool that resides on the Hub in [following guide](custom_tools).
|
||||
|
||||
### Code generation
|
||||
|
||||
So far we have shown how to use the agents to perform actions for you. However, the agent is only generating code
|
||||
that we then execute using a very restricted Python interpreter. In case you would like to use the code generated in
|
||||
a different setting, the agent can be prompted to return the code, along with tool definition and accurate imports.
|
||||
|
||||
For example, the following instruction
|
||||
```python
|
||||
agent.run("Draw me a picture of rivers and lakes", return_code=True)
|
||||
```
|
||||
|
||||
returns the following code
|
||||
|
||||
```python
|
||||
from transformers import load_tool
|
||||
|
||||
image_generator = load_tool("huggingface-tools/text-to-image")
|
||||
|
||||
image = image_generator(prompt="rivers and lakes")
|
||||
```
|
||||
|
||||
that you can then modify and execute yourself.
|
@ -76,7 +76,7 @@ Il seguente "Utilizzo in Trainer" prende come esempio mpirun nella libreria Inte
|
||||
|
||||
## Utilizzo in Trainer
|
||||
|
||||
Per abilitare l'addestramento distribuito multi CPU nel Trainer con il ccl backend, gli utenti devono aggiungere **`--ddp_backend ccl`** negli argomenti del comando.
|
||||
Per abilitare l'addestramento distribuito multi CPU nel Trainer con il ccl backend, gli utenti devono aggiungere **`--xpu_backend ccl`** negli argomenti del comando.
|
||||
|
||||
Vediamo un esempio per il [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)
|
||||
|
||||
@ -98,7 +98,7 @@ Il seguente comando abilita due processi sul nodo Xeon, con un processo in esecu
|
||||
--doc_stride 128 \
|
||||
--output_dir /tmp/debug_squad/ \
|
||||
--no_cuda \
|
||||
--ddp_backend ccl \
|
||||
--xpu_backend ccl \
|
||||
--use_ipex
|
||||
```
|
||||
|
||||
@ -131,7 +131,7 @@ A questo punto, esegui il seguente comando nel nodo0 e **4DDP** sarà abilitato
|
||||
--doc_stride 128 \
|
||||
--output_dir /tmp/debug_squad/ \
|
||||
--no_cuda \
|
||||
--ddp_backend ccl \
|
||||
--xpu_backend ccl \
|
||||
--use_ipex \
|
||||
--bf16
|
||||
```
|
||||
|
@ -4,10 +4,6 @@
|
||||
- local: installation
|
||||
title: インストール
|
||||
title: はじめに
|
||||
- sections:
|
||||
- local: accelerate
|
||||
title: 🤗 Accelerate を用いた分散学習
|
||||
title: チュートリアル
|
||||
- sections:
|
||||
- sections:
|
||||
- local: multilingual
|
||||
|
@ -1,132 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# 🤗 Accelerate を用いた分散学習
|
||||
|
||||
モデルが大きくなるにつれて、限られたハードウェアでより大きなモデルを訓練し、訓練速度を大幅に上昇させるための方法として並列処理が浮上してきました。1台のマシンに複数のGPUがあっても、複数のマシンにまたがる複数のGPUがあっても、あらゆるタイプの分散処理セットアップ上でユーザーが簡単に 🤗 Transformers モデルを訓練できるように、 Hugging Face では [🤗 Accelerate](https://huggingface.co/docs/accelerate) ライブラリを作成しました。このチュートリアルでは、PyTorch の訓練ループをカスタマイズして、分散処理環境での訓練を可能にする方法について学びます。
|
||||
|
||||
## セットアップ
|
||||
|
||||
はじめに 🤗 Accelerate をインストールしましょう:
|
||||
|
||||
```bash
|
||||
pip install accelerate
|
||||
```
|
||||
|
||||
そしたらインポートして [`~accelerate.Accelerator`] オブジェクトを作成しましょう。[`~accelerate.Accelerator`] は分散処理セットアップを自動的に検出し、訓練のために必要な全てのコンポーネントを初期化します。モデルをデバイスに明示的に配置する必要はありません。
|
||||
|
||||
```py
|
||||
>>> from accelerate import Accelerator
|
||||
|
||||
>>> accelerator = Accelerator()
|
||||
```
|
||||
|
||||
## Accelerate する準備をしましょう
|
||||
|
||||
次に、関連する全ての訓練オブジェクトを [`~accelerate.Accelerator.prepare`] メソッドに渡します。これには、訓練と評価それぞれのDataloader、モデル、optimizer が含まれます:
|
||||
|
||||
```py
|
||||
>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
|
||||
... train_dataloader, eval_dataloader, model, optimizer
|
||||
... )
|
||||
```
|
||||
|
||||
## Backward
|
||||
|
||||
最後に訓練ループ内の `loss.backward()` を 🤗 Accelerate の [`~accelerate.Accelerator.backward`] メソッドで置き換えます:
|
||||
|
||||
```py
|
||||
>>> for epoch in range(num_epochs):
|
||||
... for batch in train_dataloader:
|
||||
... outputs = model(**batch)
|
||||
... loss = outputs.loss
|
||||
... accelerator.backward(loss)
|
||||
|
||||
... optimizer.step()
|
||||
... lr_scheduler.step()
|
||||
... optimizer.zero_grad()
|
||||
... progress_bar.update(1)
|
||||
```
|
||||
|
||||
以下のコードで確認できる通り、訓練ループに4行のコードを追加するだけで分散学習が可能です!
|
||||
|
||||
```diff
|
||||
+ from accelerate import Accelerator
|
||||
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
|
||||
|
||||
+ accelerator = Accelerator()
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
|
||||
optimizer = AdamW(model.parameters(), lr=3e-5)
|
||||
|
||||
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
||||
- model.to(device)
|
||||
|
||||
+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
|
||||
+ train_dataloader, eval_dataloader, model, optimizer
|
||||
+ )
|
||||
|
||||
num_epochs = 3
|
||||
num_training_steps = num_epochs * len(train_dataloader)
|
||||
lr_scheduler = get_scheduler(
|
||||
"linear",
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=0,
|
||||
num_training_steps=num_training_steps
|
||||
)
|
||||
|
||||
progress_bar = tqdm(range(num_training_steps))
|
||||
|
||||
model.train()
|
||||
for epoch in range(num_epochs):
|
||||
for batch in train_dataloader:
|
||||
- batch = {k: v.to(device) for k, v in batch.items()}
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
- loss.backward()
|
||||
+ accelerator.backward(loss)
|
||||
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
progress_bar.update(1)
|
||||
```
|
||||
|
||||
## 訓練する
|
||||
|
||||
関連するコードを追加したら、スクリプトまたは Colaboratory などのノートブックで訓練を開始します。
|
||||
|
||||
### スクリプトで訓練する
|
||||
|
||||
スクリプトから訓練をしている場合は、設定ファイルを作成・保存するために以下のコマンドを実行してください:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
そして次のようにして訓練を開始します:
|
||||
|
||||
```bash
|
||||
accelerate launch train.py
|
||||
```
|
||||
|
||||
### ノートブックで訓練する
|
||||
|
||||
Colaboratory の TPU の利用をお考えの場合、🤗 Accelerate はノートブック上で実行することもできます。訓練に必要な全てのコードを関数に含め、[`~accelerate.notebook_launcher`] に渡してください:
|
||||
|
||||
```py
|
||||
>>> from accelerate import notebook_launcher
|
||||
|
||||
>>> notebook_launcher(training_function)
|
||||
```
|
||||
|
||||
🤗 Accelerate と豊富な機能についてもっと知りたい方は[ドキュメント](https://huggingface.co/docs/accelerate)を参照してください。
|
@ -8,28 +8,51 @@
|
||||
title: 시작하기
|
||||
- sections:
|
||||
- local: pipeline_tutorial
|
||||
title: Pipeline으로 추론하기
|
||||
title: 추론을 위한 Pipeline
|
||||
- local: autoclass_tutorial
|
||||
title: AutoClass로 사전 학습된 인스턴스 로드하기
|
||||
title: 자동 클래스로 사전 학습된 인스턴스 로드하기
|
||||
- local: preprocessing
|
||||
title: 데이터 전처리하기
|
||||
title: 전처리
|
||||
- local: training
|
||||
title: 사전 학습된 모델 미세 조정하기
|
||||
- local: run_scripts
|
||||
title: 스크립트로 학습하기
|
||||
- local: accelerate
|
||||
title: 🤗 Accelerate로 분산 학습 구성하기
|
||||
- local: model_sharing
|
||||
title: 만든 모델 공유하기
|
||||
title: 튜토리얼
|
||||
title: 🤗 Accelerate를 활용한 분산 학습
|
||||
- local: in_translation
|
||||
title: (번역중) Share a model
|
||||
title: (번역중) 튜토리얼
|
||||
- sections:
|
||||
- sections:
|
||||
- local: create_a_model
|
||||
title: 맞춤형 아키텍처 만들기
|
||||
- local: custom_models
|
||||
title: 사용자 정의 모델 공유하기
|
||||
- local: run_scripts
|
||||
title: 스크립트로 학습하기
|
||||
- local: sagemaker
|
||||
title: Amazon SageMaker에서 학습 실행하기
|
||||
- local: in_translation
|
||||
title: (번역중) Converting from TensorFlow checkpoints
|
||||
- local: serialization
|
||||
title: ONNX로 내보내기
|
||||
- local: in_translation
|
||||
title: (번역중) Export to TorchScript
|
||||
- local: in_translation
|
||||
title: (번역중) Troubleshoot
|
||||
title: (번역중) 일반적인 사용방법
|
||||
- sections:
|
||||
- local: in_translation
|
||||
title: (번역중) Use tokenizers from 🤗 Tokenizers
|
||||
- local: multilingual
|
||||
title: 다국어 모델 추론하기
|
||||
- local: in_translation
|
||||
title: (번역중) Text generation strategies
|
||||
- sections:
|
||||
- local: tasks/sequence_classification
|
||||
title: 텍스트 분류
|
||||
- local: tasks/token_classification
|
||||
title: 토큰 분류
|
||||
- local: tasks/question_answering
|
||||
title: 질의 응답(Question Answering)
|
||||
- local: in_translation
|
||||
title: (번역중) Question answering
|
||||
- local: in_translation
|
||||
title: (번역중) Causal language modeling
|
||||
- local: tasks/masked_language_modeling
|
||||
@ -38,69 +61,40 @@
|
||||
title: 번역
|
||||
- local: tasks/summarization
|
||||
title: 요약
|
||||
- local: tasks/multiple_choice
|
||||
title: 객관식 문제(Multiple Choice)
|
||||
title: 자연어처리
|
||||
isExpanded: false
|
||||
- local: in_translation
|
||||
title: (번역중) Multiple choice
|
||||
title: (번역중) 태스크별 가이드
|
||||
isExpanded: false
|
||||
title: (번역중) 자연어처리
|
||||
- sections:
|
||||
- local: in_translation
|
||||
title: (번역중) Audio classification
|
||||
- local: in_translation
|
||||
title: (번역중) Automatic speech recognition
|
||||
- local: in_translation
|
||||
title: (번역중) Audio classification
|
||||
- local: in_translation
|
||||
title: (번역중) Automatic speech recognition
|
||||
title: (번역중) 오디오
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: tasks/image_classification
|
||||
title: 이미지 분류
|
||||
- local: in_translation
|
||||
title: (번역중) Semantic segmentation
|
||||
- local: in_translation
|
||||
title: (번역중) Video classification
|
||||
- local: in_translation
|
||||
title: (번역중) Object detection
|
||||
- local: in_translation
|
||||
title: (번역중) Zero-shot object detection
|
||||
- local: tasks/zero_shot_image_classification
|
||||
title: 제로샷(zero-shot) 이미지 분류
|
||||
- local: in_translation
|
||||
title: (번역중) Depth estimation
|
||||
- local: in_translation
|
||||
title: (번역중) Image classification
|
||||
- local: in_translation
|
||||
title: (번역중) Semantic segmentation
|
||||
- local: in_translation
|
||||
title: (번역중) Video classification
|
||||
- local: in_translation
|
||||
title: (번역중) Object detection
|
||||
- local: in_translation
|
||||
title: (번역중) Zero-shot object detection
|
||||
- local: in_translation
|
||||
title: (번역중) Zero-shot image classification
|
||||
- local: in_translation
|
||||
title: (번역중) Depth estimation
|
||||
title: (번역중) 컴퓨터 비전
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: tasks/image_captioning
|
||||
title: 이미지 캡셔닝
|
||||
- local: in_translation
|
||||
title: (번역중) Document Question Answering
|
||||
- local: tasks/image_captioning
|
||||
title: 이미지 캡셔닝
|
||||
- local: in_translation
|
||||
title: (번역중) Document Question Answering
|
||||
title: (번역중) 멀티모달
|
||||
isExpanded: false
|
||||
title: 태스크 가이드
|
||||
- sections:
|
||||
- local: in_translation
|
||||
title: (번역중) Use fast tokenizers from 🤗 Tokenizers
|
||||
- local: multilingual
|
||||
title: 다국어 모델 추론하기
|
||||
- local: in_translation
|
||||
title: (번역중) Customize text generation strategy
|
||||
- local: create_a_model
|
||||
title: 모델별 API 사용하기
|
||||
- local: custom_models
|
||||
title: 사용자 정의 모델 공유하기
|
||||
- local: sagemaker
|
||||
title: Amazon SageMaker에서 학습 실행하기
|
||||
- local: serialization
|
||||
title: ONNX로 내보내기
|
||||
- local: torchscript
|
||||
title: TorchScript로 내보내기
|
||||
- local: in_translation
|
||||
title: (번역중) Benchmarks
|
||||
- local: in_translation
|
||||
title: (번역중) Notebooks with examples
|
||||
- local: in_translation
|
||||
title: (번역중) Community resources
|
||||
- local: in_translation
|
||||
title: (번역중) Troubleshoot
|
||||
title: (번역중) 개발자 가이드
|
||||
- sections:
|
||||
- sections:
|
||||
- local: in_translation
|
||||
title: (번역중) Overview
|
||||
- local: in_translation
|
||||
@ -135,8 +129,8 @@
|
||||
title: (번역중) Hyperparameter Search using Trainer API
|
||||
- local: in_translation
|
||||
title: (번역중) XLA Integration for TensorFlow Models
|
||||
title: (번역중) 성능 및 확장성
|
||||
- sections:
|
||||
title: (번역중) 성능 및 확장성
|
||||
- sections:
|
||||
- local: in_translation
|
||||
title: (번역중) How to contribute to transformers?
|
||||
- local: in_translation
|
||||
@ -149,8 +143,16 @@
|
||||
title: (번역중) Testing
|
||||
- local: in_translation
|
||||
title: (번역중) Checks on a Pull Request
|
||||
title: (번역중) 기여하기
|
||||
|
||||
title: (번역중) 기여하기
|
||||
- local: notebooks
|
||||
title: (번역중) 🤗 Transformers Notebooks
|
||||
- local: in_translation
|
||||
title: (번역중) Community resources
|
||||
- local: in_translation
|
||||
title: (번역중) Benchmarks
|
||||
- local: in_translation
|
||||
title: (번역중) Migrating from previous packages
|
||||
title: (번역중) How-to 가이드
|
||||
- sections:
|
||||
- local: in_translation
|
||||
title: (번역중) Philosophy
|
||||
@ -261,8 +263,6 @@
|
||||
title: (번역중) ConvBERT
|
||||
- local: in_translation
|
||||
title: (번역중) CPM
|
||||
- local: in_translation
|
||||
title: (번역중) CPMANT
|
||||
- local: in_translation
|
||||
title: (번역중) CTRL
|
||||
- local: in_translation
|
||||
@ -309,8 +309,6 @@
|
||||
title: (번역중) GPT-J
|
||||
- local: in_translation
|
||||
title: (번역중) GPT2
|
||||
- local: in_translation
|
||||
title: (번역중) GPTBigCode
|
||||
- local: in_translation
|
||||
title: (번역중) GPTSAN Japanese
|
||||
- local: in_translation
|
||||
@ -363,8 +361,6 @@
|
||||
title: (번역중) NLLB-MoE
|
||||
- local: in_translation
|
||||
title: (번역중) Nyströmformer
|
||||
- local: in_translation
|
||||
title: (번역중) Open-Llama
|
||||
- local: in_translation
|
||||
title: (번역중) OPT
|
||||
- local: in_translation
|
||||
@ -464,8 +460,6 @@
|
||||
title: (번역중) EfficientFormer
|
||||
- local: in_translation
|
||||
title: (번역중) EfficientNet
|
||||
- local: in_translation
|
||||
title: (번역중) FocalNet
|
||||
- local: in_translation
|
||||
title: (번역중) GLPN
|
||||
- local: in_translation
|
||||
@ -578,8 +572,6 @@
|
||||
title: (번역중) CLIPSeg
|
||||
- local: in_translation
|
||||
title: (번역중) Data2Vec
|
||||
- local: in_translation
|
||||
title: (번역중) DePlot
|
||||
- local: in_translation
|
||||
title: (번역중) Donut
|
||||
- local: in_translation
|
||||
@ -600,8 +592,6 @@
|
||||
title: (번역중) LiLT
|
||||
- local: in_translation
|
||||
title: (번역중) LXMERT
|
||||
- local: in_translation
|
||||
title: (번역중) MatCha
|
||||
- local: in_translation
|
||||
title: (번역중) MGP-STR
|
||||
- local: in_translation
|
||||
@ -612,8 +602,6 @@
|
||||
title: (번역중) Perceiver
|
||||
- local: in_translation
|
||||
title: (번역중) Pix2Struct
|
||||
- local: in_translation
|
||||
title: (번역중) Segment Anything
|
||||
- local: in_translation
|
||||
title: (번역중) Speech Encoder Decoder Models
|
||||
- local: in_translation
|
||||
|
@ -1,228 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# 모델 공유하기[[share-a-model]]
|
||||
|
||||
지난 두 튜토리얼에서 분산 설정을 위해 PyTorch, Keras 및 🤗 Accelerate를 사용하여 모델을 미세 조정하는 방법을 보았습니다. 다음 단계는 모델을 커뮤니티와 공유하는 것입니다! Hugging Face는 인공지능의 민주화를 위해 모두에게 지식과 자원을 공개적으로 공유해야 한다고 믿습니다. 다른 사람들이 시간과 자원을 절약할 수 있도록 커뮤니티에 모델을 공유하는 것을 고려해 보세요.
|
||||
|
||||
이 튜토리얼에서 [Model Hub](https://huggingface.co/models)에서 훈련되거나 미세 조정 모델을 공유하는 두 가지 방법에 대해 알아봅시다:
|
||||
|
||||
- API를 통해 파일을 Hub에 푸시합니다.
|
||||
- 웹사이트를 통해 파일을 Hub로 끌어다 놓습니다.
|
||||
|
||||
<iframe width="560" height="315" src="https://www.youtube.com/embed/XvSGPZFEjDY" title="YouTube video player"
|
||||
frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
|
||||
picture-in-picture" allowfullscreen></iframe>
|
||||
|
||||
<Tip>
|
||||
|
||||
커뮤니티에 모델을 공유하려면, [huggingface.co](https://huggingface.co/join)에 계정이 필요합니다. 기존 조직에 가입하거나 새로 만들 수도 있습니다.
|
||||
|
||||
</Tip>
|
||||
|
||||
## 저장소 특징[[repository-features]]
|
||||
|
||||
모델 허브의 각 저장소는 일반적인 GitHub 저장소처럼 작동합니다. 저장소는 버전 관리, 커밋 기록, 차이점 시각화 기능을 제공합니다.
|
||||
|
||||
모델 허브에 내장된 버전 관리는 git 및 [git-lfs](https://git-lfs.github.com/)를 기반으로 합니다. 즉, 하나의 모델을 하나의 저장소로 취급하여 접근 제어 및 확장성이 향상됩니다. 버전 제어는 커밋 해시, 태그 또는 브랜치로 모델의 특정 버전을 고정하는 방법인 *revision*을 허용합니다.
|
||||
|
||||
따라서 `revision` 매개변수를 사용하여 특정 모델 버전을 가져올 수 있습니다:
|
||||
|
||||
```py
|
||||
>>> model = AutoModel.from_pretrained(
|
||||
... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash
|
||||
... )
|
||||
```
|
||||
|
||||
또한 저장소에서 파일을 쉽게 편집할 수 있으며, 커밋 기록과 차이를 볼 수 있습니다:
|
||||
|
||||

|
||||
|
||||
## 설정[[setup]]
|
||||
|
||||
모델을 허브에 공유하기 전에 Hugging Face 자격 증명이 필요합니다. 터미널에 액세스할 수 있는 경우, 🤗 Transformers가 설치된 가상 환경에서 다음 명령을 실행합니다. 그러면 Hugging Face 캐시 폴더(기본적으로 `~/.cache/`)에 액세스 토큰을 저장합니다:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
Jupyter 또는 Colaboratory와 같은 노트북을 사용 중인 경우, [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) 라이브러리가 설치되었는지 확인하세요. 이 라이브러리를 사용하면 API로 허브와 상호 작용할 수 있습니다.
|
||||
|
||||
```bash
|
||||
pip install huggingface_hub
|
||||
```
|
||||
|
||||
그런 다음 `notebook_login`로 허브에 로그인하고, [여기](https://huggingface.co/settings/token) 링크에서 로그인할 토큰을 생성합니다:
|
||||
|
||||
```py
|
||||
>>> from huggingface_hub import notebook_login
|
||||
|
||||
>>> notebook_login()
|
||||
```
|
||||
|
||||
## 프레임워크 간 모델 변환하기[[convert-a-model-for-all-frameworks]]
|
||||
|
||||
다른 프레임워크로 작업하는 사용자가 모델을 사용할 수 있도록 하려면, PyTorch 및 TensorFlow 체크포인트를 모두 사용하여 모델을 변환하고 업로드하는 것이 좋습니다. 이 단계를 건너뛰어도 사용자는 다른 프레임워크에서 모델을 가져올 수 있지만, 🤗 Transformers가 체크포인트를 즉석에서 변환해야 하므로 속도가 느려질 수 있습니다.
|
||||
|
||||
체크포인트를 다른 프레임워크로 변환하는 것은 쉽습니다. PyTorch 및 TensorFlow가 설치되어 있는지 확인한 다음(설치 지침은 [여기](installation) 참조) 다른 프레임워크에서 작업에 대한 특정 모델을 찾습니다.
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
체크포인트를 TensorFlow에서 PyTorch로 변환하려면 `from_tf=True`를 지정하세요:
|
||||
|
||||
```py
|
||||
>>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True)
|
||||
>>> pt_model.save_pretrained("path/to/awesome-name-you-picked")
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
체크포인트를 PyTorch에서 TensorFlow로 변환하려면 `from_pt=True`를 지정하세요:
|
||||
|
||||
```py
|
||||
>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True)
|
||||
```
|
||||
|
||||
그런 다음 새로운 체크포인트와 함께 새로운 TensorFlow 모델을 저장할 수 있습니다:
|
||||
|
||||
```py
|
||||
>>> tf_model.save_pretrained("path/to/awesome-name-you-picked")
|
||||
```
|
||||
</tf>
|
||||
<jax>
|
||||
Flax에서 모델을 사용하는 경우, PyTorch에서 Flax로 체크포인트를 변환할 수도 있습니다:
|
||||
|
||||
```py
|
||||
>>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained(
|
||||
... "path/to/awesome-name-you-picked", from_pt=True
|
||||
... )
|
||||
```
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
## 훈련 중 모델 푸시하기[[push-a-model-during-training]]
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
<Youtube id="Z1-XMy-GNLQ"/>
|
||||
|
||||
모델을 허브에 공유하는 것은 추가 매개변수나 콜백을 추가하는 것만큼 간단합니다. [미세 조정 튜토리얼](training)에서 [`TrainingArguments`] 클래스는 하이퍼파라미터와 추가 훈련 옵션을 지정하는 곳이라는 것을 기억하세요. 이러한 훈련 옵션 중 하나는 모델을 허브로 직접 푸시하는 기능을 포함합니다. [`TrainingArguments`]에서 `push_to_hub=True`를 설정하세요:
|
||||
|
||||
```py
|
||||
>>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True)
|
||||
```
|
||||
|
||||
평소와 같이 훈련 인수를 [`Trainer`]에 전달합니다:
|
||||
|
||||
```py
|
||||
>>> trainer = Trainer(
|
||||
... model=model,
|
||||
... args=training_args,
|
||||
... train_dataset=small_train_dataset,
|
||||
... eval_dataset=small_eval_dataset,
|
||||
... compute_metrics=compute_metrics,
|
||||
... )
|
||||
```
|
||||
|
||||
모델을 미세 조정한 후, [`Trainer`]에서 [`~transformers.Trainer.push_to_hub`]를 호출하여 훈련된 모델을 허브로 푸시하세요. 🤗 Transformers는 훈련 하이퍼파라미터, 훈련 결과 및 프레임워크 버전을 모델 카드에 자동으로 추가합니다!
|
||||
|
||||
```py
|
||||
>>> trainer.push_to_hub()
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
[`PushToHubCallback`]을 사용하여 모델을 허브에 공유하려면, [`PushToHubCallback`]에 다음 인수를 정의하세요:
|
||||
|
||||
- 출력된 모델의 파일 경로
|
||||
- 토크나이저
|
||||
- `{Hub 사용자 이름}/{모델 이름}` 형식의 `hub_model_id`
|
||||
|
||||
```py
|
||||
>>> from transformers import PushToHubCallback
|
||||
|
||||
>>> push_to_hub_callback = PushToHubCallback(
|
||||
... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model"
|
||||
... )
|
||||
```
|
||||
|
||||
[`fit`](https://keras.io/api/models/model_training_apis/)에 콜백을 추가하면, 🤗 Transformers가 훈련된 모델을 허브로 푸시합니다:
|
||||
|
||||
```py
|
||||
>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback)
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## `push_to_hub` 함수 사용하기[[use-the-pushtohub-function]]
|
||||
|
||||
모델에서 직접 `push_to_hub`를 호출하여 허브에 업로드할 수도 있습니다.
|
||||
|
||||
`push_to_hub`에 모델 이름을 지정하세요:
|
||||
|
||||
```py
|
||||
>>> pt_model.push_to_hub("my-awesome-model")
|
||||
```
|
||||
|
||||
이렇게 하면 사용자 이름 아래에 모델 이름 `my-awesome-model`로 저장소가 생성됩니다. 이제 사용자는 `from_pretrained` 함수를 사용하여 모델을 가져올 수 있습니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModel
|
||||
|
||||
>>> model = AutoModel.from_pretrained("your_username/my-awesome-model")
|
||||
```
|
||||
|
||||
조직에 속하고 모델을 조직 이름으로 대신 푸시하려면 `repo_id`에 추가하세요:
|
||||
|
||||
```py
|
||||
>>> pt_model.push_to_hub("my-awesome-org/my-awesome-model")
|
||||
```
|
||||
|
||||
`push_to_hub` 함수는 모델 저장소에 다른 파일을 추가하는 데에도 사용할 수 있습니다. 예를 들어 모델 저장소에 토크나이저를 추가할 수 있습니다:
|
||||
|
||||
```py
|
||||
>>> tokenizer.push_to_hub("my-awesome-model")
|
||||
```
|
||||
|
||||
또는 미세 조정된 PyTorch 모델의 TensorFlow 버전을 추가할 수도 있습니다:
|
||||
|
||||
```py
|
||||
>>> tf_model.push_to_hub("my-awesome-model")
|
||||
```
|
||||
|
||||
이제 Hugging Face 프로필로 이동하면, 새로 생성한 모델 저장소가 표시됩니다. **Files** 탭을 클릭하면 저장소에 업로드한 모든 파일이 표시됩니다.
|
||||
|
||||
저장소에 파일을 만들고 업로드하는 방법에 대한 자세한 내용은 허브 설명서 [여기](https://huggingface.co/docs/hub/how-to-upstream)를 참조하세요.
|
||||
|
||||
## 웹 인터페이스로 업로드하기[[upload-with-the-web-interface]]
|
||||
|
||||
코드 없는 접근 방식을 선호하는 사용자는 허브의 웹 인터페이스를 통해 모델을 업로드할 수 있습니다. [huggingface.co/new](https://huggingface.co/new)를 방문하여 새로운 저장소를 생성하세요:
|
||||
|
||||

|
||||
|
||||
여기서 모델에 대한 몇 가지 정보를 추가하세요:
|
||||
|
||||
- 저장소의 **소유자**를 선택합니다. 이는 사용자 또는 사용자가 속한 조직일 수 있습니다.
|
||||
- 저장소 이름이 될 모델의 이름을 선택합니다.
|
||||
- 모델이 공개인지 비공개인지 선택합니다.
|
||||
- 모델의 라이센스 사용을 지정합니다.
|
||||
|
||||
이제 **Files** 탭을 클릭하고 **Add file** 버튼을 클릭하여 새로운 파일을 저장소에 업로드합니다. 그런 다음 업로드할 파일을 끌어다 놓고 커밋 메시지를 추가하세요.
|
||||
|
||||

|
||||
|
||||
## 모델 카드 추가하기[[add-a-model-card]]
|
||||
|
||||
사용자가 모델의 기능, 제한, 잠재적 편향 및 윤리적 고려 사항을 이해할 수 있도록 저장소에 모델 카드를 추가하세요. 모델 카드는 `README.md` 파일에 정의되어 있습니다. 다음 방법으로 모델 카드를 추가할 수 있습니다:
|
||||
|
||||
* `README.md` 파일을 수동으로 생성하여 업로드합니다.
|
||||
* 모델 저장소에서 **Edit model card** 버튼을 클릭합니다.
|
||||
|
||||
모델 카드에 포함할 정보 유형에 대한 좋은 예는 DistilBert [모델 카드](https://huggingface.co/distilbert-base-uncased)를 참조하세요. 모델의 탄소 발자국이나 위젯 예시 등 `README.md` 파일에서 제어할 수 있는 다른 옵션에 대한 자세한 내용은 [여기](https://huggingface.co/docs/hub/models-cards) 문서를 참조하세요.
|
1
docs/source/ko/notebooks.mdx
Normal file
1
docs/source/ko/notebooks.mdx
Normal file
@ -0,0 +1 @@
|
||||
# 열심히 번역 중입니다. 조금 이따 만나요!
|
@ -1,542 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# 이미지 분류[[image-classification]]
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
<Youtube id="tjAIM7BOYhw"/>
|
||||
|
||||
이미지 분류는 이미지에 레이블 또는 클래스를 할당합니다. 텍스트 또는 오디오 분류와 달리 입력은
|
||||
이미지를 구성하는 픽셀 값입니다. 이미지 분류에는 자연재해 후 피해 감지, 농작물 건강 모니터링, 의료 이미지에서 질병의 징후 검사 지원 등
|
||||
다양한 응용 사례가 있습니다.
|
||||
|
||||
이 가이드에서는 다음을 설명합니다:
|
||||
|
||||
1. [Food-101](https://huggingface.co/datasets/food101) 데이터 세트에서 [ViT](model_doc/vit)를 미세 조정하여 이미지에서 식품 항목을 분류합니다.
|
||||
2. 추론을 위해 미세 조정 모델을 사용합니다.
|
||||
|
||||
<Tip>
|
||||
이 튜토리얼에서 설명하는 작업은 다음 모델 아키텍처에 의해 지원됩니다:
|
||||
|
||||
<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
|
||||
|
||||
[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn)
|
||||
<!--End of the generated tip-->
|
||||
|
||||
</Tip>
|
||||
|
||||
시작하기 전에, 필요한 모든 라이브러리가 설치되어 있는지 확인하세요:
|
||||
|
||||
```bash
|
||||
pip install transformers datasets evaluate
|
||||
```
|
||||
|
||||
Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에 공유하는 것을 권장합니다. 메시지가 표시되면, 토큰을 입력하여 로그인하세요:
|
||||
|
||||
```py
|
||||
>>> from huggingface_hub import notebook_login
|
||||
|
||||
>>> notebook_login()
|
||||
```
|
||||
|
||||
## Food-101 데이터 세트 가져오기[[load-food101-dataset]]
|
||||
|
||||
🤗 Datasets 라이브러리에서 Food-101 데이터 세트의 더 작은 부분 집합을 가져오는 것으로 시작합니다. 이렇게 하면 전체 데이터 세트에 대한
|
||||
훈련에 많은 시간을 할애하기 전에 실험을 통해 모든 것이 제대로 작동하는지 확인할 수 있습니다.
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset
|
||||
|
||||
>>> food = load_dataset("food101", split="train[:5000]")
|
||||
```
|
||||
|
||||
데이터 세트의 `train`을 [`~datasets.Dataset.train_test_split`] 메소드를 사용하여 훈련 및 테스트 세트로 분할하세요:
|
||||
|
||||
```py
|
||||
>>> food = food.train_test_split(test_size=0.2)
|
||||
```
|
||||
|
||||
그리고 예시를 살펴보세요:
|
||||
|
||||
```py
|
||||
>>> food["train"][0]
|
||||
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512 at 0x7F52AFC8AC50>,
|
||||
'label': 79}
|
||||
```
|
||||
|
||||
데이터 세트의 각 예제에는 두 개의 필드가 있습니다:
|
||||
|
||||
- `image`: 식품 항목의 PIL 이미지
|
||||
- `label`: 식품 항목의 레이블 클래스
|
||||
|
||||
모델이 레이블 ID에서 레이블 이름을 쉽게 가져올 수 있도록
|
||||
레이블 이름을 정수로 매핑하고, 정수를 레이블 이름으로 매핑하는 사전을 만드세요:
|
||||
|
||||
```py
|
||||
>>> labels = food["train"].features["label"].names
|
||||
>>> label2id, id2label = dict(), dict()
|
||||
>>> for i, label in enumerate(labels):
|
||||
... label2id[label] = str(i)
|
||||
... id2label[str(i)] = label
|
||||
```
|
||||
|
||||
이제 레이블 ID를 레이블 이름으로 변환할 수 있습니다:
|
||||
|
||||
```py
|
||||
>>> id2label[str(79)]
|
||||
'prime_rib'
|
||||
```
|
||||
|
||||
## 전처리[[preprocess]]
|
||||
|
||||
다음 단계는 이미지를 텐서로 처리하기 위해 ViT 이미지 프로세서를 가져오는 것입니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoImageProcessor
|
||||
|
||||
>>> checkpoint = "google/vit-base-patch16-224-in21k"
|
||||
>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint)
|
||||
```
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
이미지에 몇 가지 이미지 변환을 적용하여 과적합에 대해 모델을 더 견고하게 만듭니다. 여기서 Torchvision의 [`transforms`](https://pytorch.org/vision/stable/transforms.html) 모듈을 사용하지만, 원하는 이미지 라이브러리를 사용할 수도 있습니다.
|
||||
|
||||
이미지의 임의 부분을 크롭하고 크기를 조정한 다음, 이미지 평균과 표준 편차로 정규화하세요:
|
||||
|
||||
```py
|
||||
>>> from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
|
||||
|
||||
>>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
|
||||
>>> size = (
|
||||
... image_processor.size["shortest_edge"]
|
||||
... if "shortest_edge" in image_processor.size
|
||||
... else (image_processor.size["height"], image_processor.size["width"])
|
||||
... )
|
||||
>>> _transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])
|
||||
```
|
||||
|
||||
그런 다음 전처리 함수를 만들어 변환을 적용하고 이미지의 `pixel_values`(모델에 대한 입력)를 반환하세요:
|
||||
|
||||
```py
|
||||
>>> def transforms(examples):
|
||||
... examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
|
||||
... del examples["image"]
|
||||
... return examples
|
||||
```
|
||||
|
||||
전체 데이터 세트에 전처리 기능을 적용하려면 🤗 Datasets [`~datasets.Dataset.with_transform`]을 사용합니다. 데이터 세트의 요소를 가져올 때 변환이 즉시 적용됩니다:
|
||||
|
||||
```py
|
||||
>>> food = food.with_transform(transforms)
|
||||
```
|
||||
|
||||
이제 [`DefaultDataCollator`]를 사용하여 예제 배치를 만듭니다. 🤗 Transformers의 다른 데이터 콜레이터와 달리, `DefaultDataCollator`는 패딩과 같은 추가적인 전처리를 적용하지 않습니다.
|
||||
|
||||
```py
|
||||
>>> from transformers import DefaultDataCollator
|
||||
|
||||
>>> data_collator = DefaultDataCollator()
|
||||
```
|
||||
</pt>
|
||||
</frameworkcontent>
|
||||
|
||||
|
||||
<frameworkcontent>
|
||||
<tf>
|
||||
|
||||
과적합을 방지하고 모델을 보다 견고하게 만들기 위해 데이터 세트의 훈련 부분에 데이터 증강을 추가합니다.
|
||||
여기서 Keras 전처리 레이어로 훈련 데이터에 대한 변환(데이터 증강 포함)과
|
||||
검증 데이터에 대한 변환(중앙 크로핑, 크기 조정, 정규화만)을 정의합니다.
|
||||
`tf.image` 또는 다른 원하는 라이브러리를 사용할 수 있습니다.
|
||||
|
||||
```py
|
||||
>>> from tensorflow import keras
|
||||
>>> from tensorflow.keras import layers
|
||||
|
||||
>>> size = (image_processor.size["height"], image_processor.size["width"])
|
||||
|
||||
>>> train_data_augmentation = keras.Sequential(
|
||||
... [
|
||||
... layers.RandomCrop(size[0], size[1]),
|
||||
... layers.Rescaling(scale=1.0 / 127.5, offset=-1),
|
||||
... layers.RandomFlip("horizontal"),
|
||||
... layers.RandomRotation(factor=0.02),
|
||||
... layers.RandomZoom(height_factor=0.2, width_factor=0.2),
|
||||
... ],
|
||||
... name="train_data_augmentation",
|
||||
... )
|
||||
|
||||
>>> val_data_augmentation = keras.Sequential(
|
||||
... [
|
||||
... layers.CenterCrop(size[0], size[1]),
|
||||
... layers.Rescaling(scale=1.0 / 127.5, offset=-1),
|
||||
... ],
|
||||
... name="val_data_augmentation",
|
||||
... )
|
||||
```
|
||||
|
||||
다음으로 한 번에 하나의 이미지가 아니라 이미지 배치에 적절한 변환을 적용하는 함수를 만듭니다.
|
||||
|
||||
```py
|
||||
>>> import numpy as np
|
||||
>>> import tensorflow as tf
|
||||
>>> from PIL import Image
|
||||
|
||||
|
||||
>>> def convert_to_tf_tensor(image: Image):
|
||||
... np_image = np.array(image)
|
||||
... tf_image = tf.convert_to_tensor(np_image)
|
||||
... # `expand_dims()` is used to add a batch dimension since
|
||||
... # the TF augmentation layers operates on batched inputs.
|
||||
... return tf.expand_dims(tf_image, 0)
|
||||
|
||||
|
||||
>>> def preprocess_train(example_batch):
|
||||
... """Apply train_transforms across a batch."""
|
||||
... images = [
|
||||
... train_data_augmentation(convert_to_tf_tensor(image.convert("RGB"))) for image in example_batch["image"]
|
||||
... ]
|
||||
... example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images]
|
||||
... return example_batch
|
||||
|
||||
|
||||
... def preprocess_val(example_batch):
|
||||
... """Apply val_transforms across a batch."""
|
||||
... images = [
|
||||
... val_data_augmentation(convert_to_tf_tensor(image.convert("RGB"))) for image in example_batch["image"]
|
||||
... ]
|
||||
... example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images]
|
||||
... return example_batch
|
||||
```
|
||||
|
||||
🤗 Datasets [`~datasets.Dataset.set_transform`]를 사용하여 즉시 변환을 적용하세요:
|
||||
|
||||
```py
|
||||
food["train"].set_transform(preprocess_train)
|
||||
food["test"].set_transform(preprocess_val)
|
||||
```
|
||||
|
||||
최종 전처리 단계로 `DefaultDataCollator`를 사용하여 예제 배치를 만듭니다. 🤗 Transformers의 다른 데이터 콜레이터와 달리
|
||||
`DefaultDataCollator`는 패딩과 같은 추가 전처리를 적용하지 않습니다.
|
||||
|
||||
```py
|
||||
>>> from transformers import DefaultDataCollator
|
||||
|
||||
>>> data_collator = DefaultDataCollator(return_tensors="tf")
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## 평가[[evaluate]]
|
||||
|
||||
훈련 중에 평가 지표를 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다.
|
||||
🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리로 평가 방법을 빠르게 가져올 수 있습니다. 이 작업에서는
|
||||
[accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) 평가 지표를 가져옵니다. (🤗 Evaluate [빠른 둘러보기](https://huggingface.co/docs/evaluate/a_quick_tour)를 참조하여 평가 지표를 가져오고 계산하는 방법에 대해 자세히 알아보세요):
|
||||
|
||||
```py
|
||||
>>> import evaluate
|
||||
|
||||
>>> accuracy = evaluate.load("accuracy")
|
||||
```
|
||||
|
||||
그런 다음 예측과 레이블을 [`~evaluate.EvaluationModule.compute`]에 전달하여 정확도를 계산하는 함수를 만듭니다:
|
||||
|
||||
```py
|
||||
>>> import numpy as np
|
||||
|
||||
|
||||
>>> def compute_metrics(eval_pred):
|
||||
... predictions, labels = eval_pred
|
||||
... predictions = np.argmax(predictions, axis=1)
|
||||
... return accuracy.compute(predictions=predictions, references=labels)
|
||||
```
|
||||
|
||||
이제 `compute_metrics` 함수를 사용할 준비가 되었으며, 훈련을 설정하면 이 함수로 되돌아올 것입니다.
|
||||
|
||||
## 훈련[[train]]
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
<Tip>
|
||||
|
||||
[`Trainer`]를 사용하여 모델을 미세 조정하는 방법에 익숙하지 않은 경우, [여기](../training#train-with-pytorch-trainer)에서 기본 튜토리얼을 확인하세요!
|
||||
|
||||
</Tip>
|
||||
|
||||
이제 모델을 훈련시킬 준비가 되었습니다! [`AutoModelForImageClassification`]로 ViT를 가져옵니다. 예상되는 레이블 수, 레이블 매핑 및 레이블 수를 지정하세요:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
|
||||
|
||||
>>> model = AutoModelForImageClassification.from_pretrained(
|
||||
... checkpoint,
|
||||
... num_labels=len(labels),
|
||||
... id2label=id2label,
|
||||
... label2id=label2id,
|
||||
... )
|
||||
```
|
||||
|
||||
이제 세 단계만 거치면 끝입니다:
|
||||
|
||||
1. [`TrainingArguments`]에서 훈련 하이퍼파라미터를 정의하세요. `image` 열이 삭제되기 때문에 미사용 열을 제거하지 않는 것이 중요합니다. `image` 열이 없으면 `pixel_values`을 생성할 수 없습니다. 이 동작을 방지하려면 `remove_unused_columns=False`로 설정하세요! 다른 유일한 필수 매개변수는 모델 저장 위치를 지정하는 `output_dir`입니다. `push_to_hub=True`로 설정하면 이 모델을 허브에 푸시합니다(모델을 업로드하려면 Hugging Face에 로그인해야 합니다). 각 에폭이 끝날 때마다, [`Trainer`]가 정확도를 평가하고 훈련 체크포인트를 저장합니다.
|
||||
2. [`Trainer`]에 모델, 데이터 세트, 토크나이저, 데이터 콜레이터 및 `compute_metrics` 함수와 함께 훈련 인수를 전달하세요.
|
||||
3. [`~Trainer.train`]을 호출하여 모델을 미세 조정하세요.
|
||||
|
||||
```py
|
||||
>>> training_args = TrainingArguments(
|
||||
... output_dir="my_awesome_food_model",
|
||||
... remove_unused_columns=False,
|
||||
... evaluation_strategy="epoch",
|
||||
... save_strategy="epoch",
|
||||
... learning_rate=5e-5,
|
||||
... per_device_train_batch_size=16,
|
||||
... gradient_accumulation_steps=4,
|
||||
... per_device_eval_batch_size=16,
|
||||
... num_train_epochs=3,
|
||||
... warmup_ratio=0.1,
|
||||
... logging_steps=10,
|
||||
... load_best_model_at_end=True,
|
||||
... metric_for_best_model="accuracy",
|
||||
... push_to_hub=True,
|
||||
... )
|
||||
|
||||
>>> trainer = Trainer(
|
||||
... model=model,
|
||||
... args=training_args,
|
||||
... data_collator=data_collator,
|
||||
... train_dataset=food["train"],
|
||||
... eval_dataset=food["test"],
|
||||
... tokenizer=image_processor,
|
||||
... compute_metrics=compute_metrics,
|
||||
... )
|
||||
|
||||
>>> trainer.train()
|
||||
```
|
||||
|
||||
훈련이 완료되면, 모든 사람이 모델을 사용할 수 있도록 [`~transformers.Trainer.push_to_hub`] 메소드로 모델을 허브에 공유하세요:
|
||||
|
||||
```py
|
||||
>>> trainer.push_to_hub()
|
||||
```
|
||||
</pt>
|
||||
</frameworkcontent>
|
||||
|
||||
<frameworkcontent>
|
||||
<tf>
|
||||
|
||||
<Tip>
|
||||
|
||||
Keras를 사용하여 모델을 미세 조정하는 방법에 익숙하지 않은 경우, 먼저 [기본 튜토리얼](./training#train-a-tensorflow-model-with-keras)을 확인하세요!
|
||||
|
||||
</Tip>
|
||||
|
||||
TensorFlow에서 모델을 미세 조정하려면 다음 단계를 따르세요:
|
||||
1. 훈련 하이퍼파라미터를 정의하고 옵티마이저와 학습률 스케쥴을 설정합니다.
|
||||
2. 사전 훈련된 모델을 인스턴스화합니다.
|
||||
3. 🤗 Dataset을 `tf.data.Dataset`으로 변환합니다.
|
||||
4. 모델을 컴파일합니다.
|
||||
5. 콜백을 추가하고 훈련을 수행하기 위해 `fit()` 메소드를 사용합니다.
|
||||
6. 커뮤니티와 공유하기 위해 모델을 🤗 Hub에 업로드합니다.
|
||||
|
||||
하이퍼파라미터, 옵티마이저 및 학습률 스케쥴을 정의하는 것으로 시작합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import create_optimizer
|
||||
|
||||
>>> batch_size = 16
|
||||
>>> num_epochs = 5
|
||||
>>> num_train_steps = len(food["train"]) * num_epochs
|
||||
>>> learning_rate = 3e-5
|
||||
>>> weight_decay_rate = 0.01
|
||||
|
||||
>>> optimizer, lr_schedule = create_optimizer(
|
||||
... init_lr=learning_rate,
|
||||
... num_train_steps=num_train_steps,
|
||||
... weight_decay_rate=weight_decay_rate,
|
||||
... num_warmup_steps=0,
|
||||
... )
|
||||
```
|
||||
|
||||
그런 다음 레이블 매핑과 함께 [`TFAuto ModelForImageClassification`]으로 ViT를 가져옵니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForImageClassification
|
||||
|
||||
>>> model = TFAutoModelForImageClassification.from_pretrained(
|
||||
... checkpoint,
|
||||
... id2label=id2label,
|
||||
... label2id=label2id,
|
||||
... )
|
||||
```
|
||||
|
||||
데이터 세트를 [`~datasets.Dataset.to_tf_dataset`]와 `data_collator`를 사용하여 `tf.data.Dataset` 형식으로 변환하세요:
|
||||
|
||||
```py
|
||||
>>> # converting our train dataset to tf.data.Dataset
|
||||
>>> tf_train_dataset = food["train"].to_tf_dataset(
|
||||
... columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator
|
||||
... )
|
||||
|
||||
>>> # converting our test dataset to tf.data.Dataset
|
||||
>>> tf_eval_dataset = food["test"].to_tf_dataset(
|
||||
... columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator
|
||||
... )
|
||||
```
|
||||
|
||||
`compile()`를 사용하여 훈련 모델을 구성하세요:
|
||||
|
||||
```py
|
||||
>>> from tensorflow.keras.losses import SparseCategoricalCrossentropy
|
||||
|
||||
>>> loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
||||
>>> model.compile(optimizer=optimizer, loss=loss)
|
||||
```
|
||||
|
||||
예측에서 정확도를 계산하고 모델을 🤗 Hub로 푸시하려면 [Keras callbacks](../main_classes/keras_callbacks)를 사용하세요.
|
||||
`compute_metrics` 함수를 [KerasMetricCallback](../main_classes/keras_callbacks#transformers.KerasMetricCallback)에 전달하고,
|
||||
[PushToHubCallback](../main_classes/keras_callbacks#transformers.PushToHubCallback)을 사용하여 모델을 업로드합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
|
||||
|
||||
>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset)
|
||||
>>> push_to_hub_callback = PushToHubCallback(
|
||||
... output_dir="food_classifier",
|
||||
... tokenizer=image_processor,
|
||||
... save_strategy="no",
|
||||
... )
|
||||
>>> callbacks = [metric_callback, push_to_hub_callback]
|
||||
```
|
||||
|
||||
이제 모델을 훈련할 준비가 되었습니다! 훈련 및 검증 데이터 세트, 에폭 수와 함께 `fit()`을 호출하고,
|
||||
콜백을 사용하여 모델을 미세 조정합니다:
|
||||
|
||||
```py
|
||||
>>> model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs, callbacks=callbacks)
|
||||
Epoch 1/5
|
||||
250/250 [==============================] - 313s 1s/step - loss: 2.5623 - val_loss: 1.4161 - accuracy: 0.9290
|
||||
Epoch 2/5
|
||||
250/250 [==============================] - 265s 1s/step - loss: 0.9181 - val_loss: 0.6808 - accuracy: 0.9690
|
||||
Epoch 3/5
|
||||
250/250 [==============================] - 252s 1s/step - loss: 0.3910 - val_loss: 0.4303 - accuracy: 0.9820
|
||||
Epoch 4/5
|
||||
250/250 [==============================] - 251s 1s/step - loss: 0.2028 - val_loss: 0.3191 - accuracy: 0.9900
|
||||
Epoch 5/5
|
||||
250/250 [==============================] - 238s 949ms/step - loss: 0.1232 - val_loss: 0.3259 - accuracy: 0.9890
|
||||
```
|
||||
|
||||
축하합니다! 모델을 미세 조정하고 🤗 Hub에 공유했습니다. 이제 추론에 사용할 수 있습니다!
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
|
||||
<Tip>
|
||||
|
||||
이미지 분류를 위한 모델을 미세 조정하는 자세한 예제는 다음 [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)을 참조하세요.
|
||||
|
||||
</Tip>
|
||||
|
||||
## 추론[[inference]]
|
||||
|
||||
좋아요, 이제 모델을 미세 조정했으니 추론에 사용할 수 있습니다!
|
||||
|
||||
추론을 수행하고자 하는 이미지를 가져와봅시다:
|
||||
|
||||
```py
|
||||
>>> ds = load_dataset("food101", split="validation[:10]")
|
||||
>>> image = ds["image"][0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png" alt="image of beignets"/>
|
||||
</div>
|
||||
|
||||
미세 조정 모델로 추론을 시도하는 가장 간단한 방법은 [`pipeline`]을 사용하는 것입니다. 모델로 이미지 분류를 위한 `pipeline`을 인스턴스화하고 이미지를 전달합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> classifier = pipeline("image-classification", model="my_awesome_food_model")
|
||||
>>> classifier(image)
|
||||
[{'score': 0.31856709718704224, 'label': 'beignets'},
|
||||
{'score': 0.015232225880026817, 'label': 'bruschetta'},
|
||||
{'score': 0.01519392803311348, 'label': 'chicken_wings'},
|
||||
{'score': 0.013022331520915031, 'label': 'pork_chop'},
|
||||
{'score': 0.012728818692266941, 'label': 'prime_rib'}]
|
||||
```
|
||||
|
||||
원한다면, `pipeline`의 결과를 수동으로 복제할 수도 있습니다:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
이미지를 전처리하기 위해 이미지 프로세서를 가져오고 `input`을 PyTorch 텐서로 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoImageProcessor
|
||||
>>> import torch
|
||||
|
||||
>>> image_processor = AutoImageProcessor.from_pretrained("my_awesome_food_model")
|
||||
>>> inputs = image_processor(image, return_tensors="pt")
|
||||
```
|
||||
|
||||
입력을 모델에 전달하고 logits을 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForImageClassification
|
||||
|
||||
>>> model = AutoModelForImageClassification.from_pretrained("my_awesome_food_model")
|
||||
>>> with torch.no_grad():
|
||||
... logits = model(**inputs).logits
|
||||
```
|
||||
|
||||
확률이 가장 높은 예측 레이블을 가져오고, 모델의 `id2label` 매핑을 사용하여 레이블로 변환합니다:
|
||||
|
||||
```py
|
||||
>>> predicted_label = logits.argmax(-1).item()
|
||||
>>> model.config.id2label[predicted_label]
|
||||
'beignets'
|
||||
```
|
||||
</pt>
|
||||
</frameworkcontent>
|
||||
|
||||
<frameworkcontent>
|
||||
<tf>
|
||||
이미지를 전처리하기 위해 이미지 프로세서를 가져오고 `input`을 TensorFlow 텐서로 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoImageProcessor
|
||||
|
||||
>>> image_processor = AutoImageProcessor.from_pretrained("MariaK/food_classifier")
|
||||
>>> inputs = image_processor(image, return_tensors="tf")
|
||||
```
|
||||
|
||||
입력을 모델에 전달하고 logits을 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForImageClassification
|
||||
|
||||
>>> model = TFAutoModelForImageClassification.from_pretrained("MariaK/food_classifier")
|
||||
>>> logits = model(**inputs).logits
|
||||
```
|
||||
|
||||
확률이 가장 높은 예측 레이블을 가져오고, 모델의 `id2label` 매핑을 사용하여 레이블로 변환합니다:
|
||||
|
||||
```py
|
||||
>>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
|
||||
>>> model.config.id2label[predicted_class_id]
|
||||
'beignets'
|
||||
```
|
||||
|
||||
</tf>
|
||||
</frameworkcontent>
|
@ -1,461 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# 객관식 문제[[multiple-choice]]
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
객관식 과제는 문맥과 함께 여러 개의 후보 답변이 제공되고 모델이 정답을 선택하도록 학습된다는 점을 제외하면 질의응답과 유사합니다.
|
||||
|
||||
진행하는 방법은 아래와 같습니다:
|
||||
|
||||
1. [SWAG](https://huggingface.co/datasets/swag) 데이터 세트의 'regular' 구성으로 [BERT](https://huggingface.co/bert-base-uncased)를 미세 조정하여 여러 옵션과 일부 컨텍스트가 주어졌을 때 가장 적합한 답을 선택합니다.
|
||||
2. 추론에 미세 조정된 모델을 사용합니다.
|
||||
|
||||
<Tip>
|
||||
이 튜토리얼에서 설명하는 작업은 다음 모델 아키텍처에서 지원됩니다:
|
||||
|
||||
<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
|
||||
|
||||
[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
|
||||
|
||||
<!--End of the generated tip-->
|
||||
|
||||
</Tip>
|
||||
|
||||
시작하기 전에 필요한 라이브러리가 모두 설치되어 있는지 확인하세요:
|
||||
|
||||
```bash
|
||||
pip install transformers datasets evaluate
|
||||
```
|
||||
|
||||
모델을 업로드하고 커뮤니티와 공유할 수 있도록 허깅페이스 계정에 로그인하는 것이 좋습니다. 메시지가 표시되면 토큰을 입력하여 로그인합니다:
|
||||
|
||||
```py
|
||||
>>> from huggingface_hub import notebook_login
|
||||
|
||||
>>> notebook_login()
|
||||
```
|
||||
|
||||
## SWAG 데이터 세트 가져오기[[load-swag-dataset]]
|
||||
|
||||
먼저 🤗 Datasets 라이브러리에서 SWAG 데이터셋의 '일반' 구성을 가져옵니다:
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset
|
||||
|
||||
>>> swag = load_dataset("swag", "regular")
|
||||
```
|
||||
|
||||
이제 데이터를 살펴봅니다:
|
||||
|
||||
```py
|
||||
>>> swag["train"][0]
|
||||
{'ending0': 'passes by walking down the street playing their instruments.',
|
||||
'ending1': 'has heard approaching them.',
|
||||
'ending2': "arrives and they're outside dancing and asleep.",
|
||||
'ending3': 'turns the lead singer watches the performance.',
|
||||
'fold-ind': '3416',
|
||||
'gold-source': 'gold',
|
||||
'label': 0,
|
||||
'sent1': 'Members of the procession walk down the street holding small horn brass instruments.',
|
||||
'sent2': 'A drum line',
|
||||
'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line',
|
||||
'video-id': 'anetv_jkn6uvmqwh4'}
|
||||
```
|
||||
|
||||
여기에는 많은 필드가 있는 것처럼 보이지만 실제로는 매우 간단합니다:
|
||||
|
||||
- `sent1` 및 `sent2`: 이 필드는 문장이 어떻게 시작되는지 보여주며, 이 두 필드를 합치면 `시작 구절(startphrase)` 필드가 됩니다.
|
||||
- `종료 구절(ending)`: 문장이 어떻게 끝날 수 있는지에 대한 가능한 종료 구절를 제시하지만 그 중 하나만 정답입니다.
|
||||
- `레이블(label)`: 올바른 문장 종료 구절을 식별합니다.
|
||||
|
||||
## 전처리[[preprocess]]
|
||||
|
||||
다음 단계는 문장의 시작과 네 가지 가능한 구절을 처리하기 위해 BERT 토크나이저를 불러옵니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
```
|
||||
|
||||
생성하려는 전처리 함수는 다음과 같아야 합니다:
|
||||
|
||||
1. `sent1` 필드를 네 개 복사한 다음 각각을 `sent2`와 결합하여 문장이 시작되는 방식을 재현합니다.
|
||||
2. `sent2`를 네 가지 가능한 문장 구절 각각과 결합합니다.
|
||||
3. 이 두 목록을 토큰화할 수 있도록 평탄화(flatten)하고, 각 예제에 해당하는 `input_ids`, `attention_mask` 및 `labels` 필드를 갖도록 다차원화(unflatten) 합니다.
|
||||
|
||||
```py
|
||||
>>> ending_names = ["ending0", "ending1", "ending2", "ending3"]
|
||||
|
||||
|
||||
>>> def preprocess_function(examples):
|
||||
... first_sentences = [[context] * 4 for context in examples["sent1"]]
|
||||
... question_headers = examples["sent2"]
|
||||
... second_sentences = [
|
||||
... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
|
||||
... ]
|
||||
|
||||
... first_sentences = sum(first_sentences, [])
|
||||
... second_sentences = sum(second_sentences, [])
|
||||
|
||||
... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
|
||||
... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
|
||||
```
|
||||
|
||||
전체 데이터 집합에 전처리 기능을 적용하려면 🤗 Datasets [`~datasets.Dataset.map`] 메소드를 사용합니다. `batched=True`를 설정하여 데이터 집합의 여러 요소를 한 번에 처리하면 `map` 함수의 속도를 높일 수 있습니다:
|
||||
|
||||
```py
|
||||
tokenized_swag = swag.map(preprocess_function, batched=True)
|
||||
```
|
||||
|
||||
🤗 Transformers에는 객관식용 데이터 콜레이터가 없으므로 예제 배치를 만들려면 [`DataCollatorWithPadding`]을 조정해야 합니다. 데이터 정렬 중에 전체 데이터 집합을 최대 길이로 패딩하는 대신 배치 중 가장 긴 길이로 문장을 *동적 패딩*하는 것이 더 효율적입니다.
|
||||
|
||||
`DataCollatorForMultipleChoice`는 모든 모델 입력을 평탄화하고 패딩을 적용하며 그 결과를 결과를 다차원화합니다:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import torch
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="pt",
|
||||
... )
|
||||
|
||||
... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
|
||||
... batch["labels"] = torch.tensor(labels, dtype=torch.int64)
|
||||
... return batch
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import tensorflow as tf
|
||||
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="tf",
|
||||
... )
|
||||
|
||||
... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
|
||||
... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
|
||||
... return batch
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## 평가 하기[[evaluate]]
|
||||
|
||||
훈련 중에 메트릭을 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다. 🤗[Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리를 사용하여 평가 방법을 빠르게 가져올 수 있습니다. 이 작업에서는 [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) 지표를 가져옵니다(🤗 Evaluate [둘러보기](https://huggingface.co/docs/evaluate/a_quick_tour)를 참조하여 지표를 가져오고 계산하는 방법에 대해 자세히 알아보세요):
|
||||
|
||||
```py
|
||||
>>> import evaluate
|
||||
|
||||
>>> accuracy = evaluate.load("accuracy")
|
||||
```
|
||||
|
||||
그리고 예측과 레이블을 [`~evaluate.EvaluationModule.compute`]에 전달하여 정확도를 계산하는 함수를 만듭니다:
|
||||
|
||||
```py
|
||||
>>> import numpy as np
|
||||
|
||||
|
||||
>>> def compute_metrics(eval_pred):
|
||||
... predictions, labels = eval_pred
|
||||
... predictions = np.argmax(predictions, axis=1)
|
||||
... return accuracy.compute(predictions=predictions, references=labels)
|
||||
```
|
||||
|
||||
이제 `compute_metrics` 함수를 사용할 준비가 되었으며, 훈련을 설정할 때 이 함수로 돌아가게 됩니다.
|
||||
|
||||
## 훈련 하기[[train]]
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
<Tip>
|
||||
|
||||
[`Trainer`]로 모델을 미세 조정하는 데 익숙하지 않다면 기본 튜토리얼 [여기](../training#train-with-pytorch-trainer)를 살펴보세요!
|
||||
|
||||
</Tip>
|
||||
|
||||
이제 모델 훈련을 시작할 준비가 되었습니다! [`AutoModelForMultipleChoice`]로 BERT를 로드합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
|
||||
|
||||
>>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
|
||||
```
|
||||
|
||||
이제 세 단계만 남았습니다:
|
||||
|
||||
1. 훈련 하이퍼파라미터를 [`TrainingArguments`]에 정의합니다. 유일한 필수 매개변수는 모델을 저장할 위치를 지정하는 `output_dir`입니다. `push_to_hub=True`를 설정하여 이 모델을 허브에 푸시합니다(모델을 업로드하려면 허깅 페이스에 로그인해야 합니다). 각 에폭이 끝날 때마다 [`Trainer`]가 정확도를 평가하고 훈련 체크포인트를 저장합니다.
|
||||
2. 모델, 데이터 세트, 토크나이저, 데이터 콜레이터, `compute_metrics` 함수와 함께 훈련 인자를 [`Trainer`]에 전달합니다.
|
||||
3. [`~Trainer.train`]을 사용하여 모델을 미세 조정합니다.
|
||||
|
||||
```py
|
||||
>>> training_args = TrainingArguments(
|
||||
... output_dir="my_awesome_swag_model",
|
||||
... evaluation_strategy="epoch",
|
||||
... save_strategy="epoch",
|
||||
... load_best_model_at_end=True,
|
||||
... learning_rate=5e-5,
|
||||
... per_device_train_batch_size=16,
|
||||
... per_device_eval_batch_size=16,
|
||||
... num_train_epochs=3,
|
||||
... weight_decay=0.01,
|
||||
... push_to_hub=True,
|
||||
... )
|
||||
|
||||
>>> trainer = Trainer(
|
||||
... model=model,
|
||||
... args=training_args,
|
||||
... train_dataset=tokenized_swag["train"],
|
||||
... eval_dataset=tokenized_swag["validation"],
|
||||
... tokenizer=tokenizer,
|
||||
... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
|
||||
... compute_metrics=compute_metrics,
|
||||
... )
|
||||
|
||||
>>> trainer.train()
|
||||
```
|
||||
|
||||
훈련이 완료되면 모든 사람이 모델을 사용할 수 있도록 [`~transformers.Trainer.push_to_hub`] 메소드를 사용하여 모델을 허브에 공유하세요:
|
||||
|
||||
```py
|
||||
>>> trainer.push_to_hub()
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
<Tip>
|
||||
|
||||
Keras로 모델을 미세 조정하는 데 익숙하지 않다면 기본 튜토리얼 [여기](../training#train-a-tensorflow-model-with-keras)를 살펴보시기 바랍니다!
|
||||
|
||||
</Tip>
|
||||
TensorFlow에서 모델을 미세 조정하려면 최적화 함수, 학습률 스케쥴 및 몇 가지 학습 하이퍼파라미터를 설정하는 것부터 시작하세요:
|
||||
|
||||
```py
|
||||
>>> from transformers import create_optimizer
|
||||
|
||||
>>> batch_size = 16
|
||||
>>> num_train_epochs = 2
|
||||
>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs
|
||||
>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
|
||||
```
|
||||
|
||||
그리고 [`TFAutoModelForMultipleChoice`]로 BERT를 가져올 수 있습니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForMultipleChoice
|
||||
|
||||
>>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
|
||||
```
|
||||
|
||||
[`~transformers.TFPreTrainedModel.prepare_tf_dataset`]을 사용하여 데이터 세트를 `tf.data.Dataset` 형식으로 변환합니다:
|
||||
|
||||
```py
|
||||
>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
|
||||
>>> tf_train_set = model.prepare_tf_dataset(
|
||||
... tokenized_swag["train"],
|
||||
... shuffle=True,
|
||||
... batch_size=batch_size,
|
||||
... collate_fn=data_collator,
|
||||
... )
|
||||
|
||||
>>> tf_validation_set = model.prepare_tf_dataset(
|
||||
... tokenized_swag["validation"],
|
||||
... shuffle=False,
|
||||
... batch_size=batch_size,
|
||||
... collate_fn=data_collator,
|
||||
... )
|
||||
```
|
||||
|
||||
[`compile`](https://keras.io/api/models/model_training_apis/#compile-method)을 사용하여 훈련 모델을 구성합니다:
|
||||
|
||||
```py
|
||||
>>> model.compile(optimizer=optimizer)
|
||||
```
|
||||
|
||||
훈련을 시작하기 전에 설정해야 할 마지막 두 가지는 예측의 정확도를 계산하고 모델을 허브로 푸시하는 방법을 제공하는 것입니다. 이 두 가지 작업은 모두 [Keras 콜백](../main_classes/keras_callbacks)을 사용하여 수행할 수 있습니다.
|
||||
|
||||
`compute_metrics`함수를 [`~transformers.KerasMetricCallback`]에 전달하세요:
|
||||
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import KerasMetricCallback
|
||||
|
||||
>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
|
||||
```
|
||||
|
||||
모델과 토크나이저를 업로드할 위치를 [`~transformers.PushToHubCallback`]에서 지정하세요:
|
||||
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import PushToHubCallback
|
||||
|
||||
>>> push_to_hub_callback = PushToHubCallback(
|
||||
... output_dir="my_awesome_model",
|
||||
... tokenizer=tokenizer,
|
||||
... )
|
||||
```
|
||||
|
||||
그리고 콜백을 함께 묶습니다:
|
||||
|
||||
```py
|
||||
>>> callbacks = [metric_callback, push_to_hub_callback]
|
||||
```
|
||||
|
||||
이제 모델 훈련을 시작합니다! 훈련 및 검증 데이터 세트, 에폭 수, 콜백을 사용하여 [`fit`](https://keras.io/api/models/model_training_apis/#fit-method)을 호출하고 모델을 미세 조정합니다:
|
||||
|
||||
```py
|
||||
>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks)
|
||||
```
|
||||
|
||||
훈련이 완료되면 모델이 자동으로 허브에 업로드되어 누구나 사용할 수 있습니다!
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
|
||||
<Tip>
|
||||
|
||||
객관식 모델을 미세 조정하는 방법에 대한 보다 심층적인 예는 아래 문서를 참조하세요.
|
||||
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)
|
||||
또는 [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb).
|
||||
|
||||
</Tip>
|
||||
|
||||
## 추론 하기[[inference]]
|
||||
|
||||
이제 모델을 미세 조정했으니 추론에 사용할 수 있습니다!
|
||||
|
||||
텍스트와 두 개의 후보 답안을 작성합니다:
|
||||
|
||||
```py
|
||||
>>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette."
|
||||
>>> candidate1 = "The law does not apply to croissants and brioche."
|
||||
>>> candidate2 = "The law applies to baguettes."
|
||||
```
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
각 프롬프트와 후보 답변 쌍을 토큰화하여 PyTorch 텐서를 반환합니다. 또한 `labels`을 생성해야 합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model")
|
||||
>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True)
|
||||
>>> labels = torch.tensor(0).unsqueeze(0)
|
||||
```
|
||||
|
||||
입력과 레이블을 모델에 전달하고 `logits`을 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForMultipleChoice
|
||||
|
||||
>>> model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")
|
||||
>>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
|
||||
>>> logits = outputs.logits
|
||||
```
|
||||
|
||||
가장 높은 확률을 가진 클래스를 가져옵니다:
|
||||
|
||||
```py
|
||||
>>> predicted_class = logits.argmax().item()
|
||||
>>> predicted_class
|
||||
'0'
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
각 프롬프트와 후보 답안 쌍을 토큰화하여 텐서플로 텐서를 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model")
|
||||
>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True)
|
||||
```
|
||||
|
||||
모델에 입력을 전달하고 `logits`를 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForMultipleChoice
|
||||
|
||||
>>> model = TFAutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")
|
||||
>>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()}
|
||||
>>> outputs = model(inputs)
|
||||
>>> logits = outputs.logits
|
||||
```
|
||||
|
||||
가장 높은 확률을 가진 클래스를 가져옵니다:
|
||||
|
||||
```py
|
||||
>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0])
|
||||
>>> predicted_class
|
||||
'0'
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
@ -1,424 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# 질의 응답(Question Answering)[[question-answering]]
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
<Youtube id="ajPx5LwJD-I"/>
|
||||
|
||||
질의 응답 태스크는 주어진 질문에 대한 답변을 제공합니다. Alexa, Siri 또는 Google과 같은 가상 비서에게 날씨가 어떤지 물어본 적이 있다면 질의 응답 모델을 사용해본 적이 있을 것입니다. 질의 응답 태스크에는 일반적으로 두 가지 유형이 있습니다.
|
||||
|
||||
- 추출적(Extractive) 질의 응답: 주어진 문맥에서 답변을 추출합니다.
|
||||
- 생성적(Abstractive) 질의 응답: 문맥에서 질문에 올바르게 답하는 답변을 생성합니다.
|
||||
|
||||
이 가이드는 다음과 같은 방법들을 보여줍니다.
|
||||
|
||||
1. 추출적 질의 응답을 하기 위해 [SQuAD](https://huggingface.co/datasets/squad) 데이터 세트에서 [DistilBERT](https://huggingface.co/distilbert-base-uncased) 미세 조정하기
|
||||
2. 추론에 미세 조정된 모델 사용하기
|
||||
|
||||
<Tip>
|
||||
이 튜토리얼에서 설명하는 태스크는 다음과 같은 모델 아키텍처에서 지원됩니다.
|
||||
|
||||
<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
|
||||
|
||||
[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
|
||||
|
||||
|
||||
<!--End of the generated tip-->
|
||||
|
||||
</Tip>
|
||||
|
||||
시작하기 전에, 필요한 라이브러리가 모두 설치되어 있는지 확인하세요:
|
||||
|
||||
```bash
|
||||
pip install transformers datasets evaluate
|
||||
```
|
||||
|
||||
여러분의 모델을 업로드하고 커뮤니티에 공유할 수 있도록 Hugging Face 계정에 로그인하는 것이 좋습니다. 메시지가 표시되면 토큰을 입력해서 로그인합니다:
|
||||
|
||||
```py
|
||||
>>> from huggingface_hub import notebook_login
|
||||
|
||||
>>> notebook_login()
|
||||
```
|
||||
|
||||
## SQuAD 데이터 세트 가져오기[[load-squad-dataset]]
|
||||
|
||||
먼저 🤗 Datasets 라이브러리에서 SQuAD 데이터 세트의 일부를 가져옵니다. 이렇게 하면 전체 데이터 세트로 훈련하며 더 많은 시간을 할애하기 전에 모든 것이 잘 작동하는지 실험하고 확인할 수 있습니다.
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset
|
||||
|
||||
>>> squad = load_dataset("squad", split="train[:5000]")
|
||||
```
|
||||
|
||||
데이터 세트의 분할된 `train`을 [`~datasets.Dataset.train_test_split`] 메소드를 사용해 훈련 데이터 세트와 테스트 데이터 세트로 나누어줍니다:
|
||||
|
||||
```py
|
||||
>>> squad = squad.train_test_split(test_size=0.2)
|
||||
```
|
||||
|
||||
그리고나서 예시로 데이터를 하나 살펴봅니다:
|
||||
|
||||
```py
|
||||
>>> squad["train"][0]
|
||||
{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
|
||||
'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
|
||||
'id': '5733be284776f41900661182',
|
||||
'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
|
||||
'title': 'University_of_Notre_Dame'
|
||||
}
|
||||
```
|
||||
|
||||
이 중에서 몇 가지 중요한 항목이 있습니다:
|
||||
|
||||
- `answers`: 답안 토큰의 시작 위치와 답안 텍스트
|
||||
- `context`: 모델이 답을 추출하는데 필요한 배경 지식
|
||||
- `question`: 모델이 답해야 하는 질문
|
||||
|
||||
## 전처리[[preprocess]]
|
||||
|
||||
<Youtube id="qgaM0weJHpA"/>
|
||||
|
||||
다음 단계에서는 `question` 및 `context` 항목을 처리하기 위해 DistilBERT 토크나이저를 가져옵니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
질의 응답 태스크와 관련해서 특히 유의해야할 몇 가지 전처리 단계가 있습니다:
|
||||
|
||||
1. 데이터 세트의 일부 예제에는 모델의 최대 입력 길이를 초과하는 매우 긴 `context`가 있을 수 있습니다. 긴 시퀀스를 다루기 위해서는, `truncation="only_second"`로 설정해 `context`만 잘라내면 됩니다.
|
||||
2. 그 다음, `return_offset_mapping=True`로 설정해 답변의 시작과 종료 위치를 원래의 `context`에 매핑합니다.
|
||||
3. 매핑을 완료하면, 이제 답변에서 시작 토큰과 종료 토큰을 찾을 수 있습니다. 오프셋의 어느 부분이 `question`과 `context`에 해당하는지 찾을 수 있도록 [`~tokenizers.Encoding.sequence_ids`] 메소드를 사용하세요.
|
||||
|
||||
다음은 `answer`의 시작 토큰과 종료 토큰을 잘라내서 `context`에 매핑하는 함수를 만드는 방법입니다:
|
||||
|
||||
```py
|
||||
>>> def preprocess_function(examples):
|
||||
... questions = [q.strip() for q in examples["question"]]
|
||||
... inputs = tokenizer(
|
||||
... questions,
|
||||
... examples["context"],
|
||||
... max_length=384,
|
||||
... truncation="only_second",
|
||||
... return_offsets_mapping=True,
|
||||
... padding="max_length",
|
||||
... )
|
||||
|
||||
... offset_mapping = inputs.pop("offset_mapping")
|
||||
... answers = examples["answers"]
|
||||
... start_positions = []
|
||||
... end_positions = []
|
||||
|
||||
... for i, offset in enumerate(offset_mapping):
|
||||
... answer = answers[i]
|
||||
... start_char = answer["answer_start"][0]
|
||||
... end_char = answer["answer_start"][0] + len(answer["text"][0])
|
||||
... sequence_ids = inputs.sequence_ids(i)
|
||||
|
||||
... # Find the start and end of the context
|
||||
... idx = 0
|
||||
... while sequence_ids[idx] != 1:
|
||||
... idx += 1
|
||||
... context_start = idx
|
||||
... while sequence_ids[idx] == 1:
|
||||
... idx += 1
|
||||
... context_end = idx - 1
|
||||
|
||||
... # If the answer is not fully inside the context, label it (0, 0)
|
||||
... if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
|
||||
... start_positions.append(0)
|
||||
... end_positions.append(0)
|
||||
... else:
|
||||
... # Otherwise it's the start and end token positions
|
||||
... idx = context_start
|
||||
... while idx <= context_end and offset[idx][0] <= start_char:
|
||||
... idx += 1
|
||||
... start_positions.append(idx - 1)
|
||||
|
||||
... idx = context_end
|
||||
... while idx >= context_start and offset[idx][1] >= end_char:
|
||||
... idx -= 1
|
||||
... end_positions.append(idx + 1)
|
||||
|
||||
... inputs["start_positions"] = start_positions
|
||||
... inputs["end_positions"] = end_positions
|
||||
... return inputs
|
||||
```
|
||||
|
||||
모든 데이터 세트에 전처리를 적용하려면, 🤗 Datasets [`~datasets.Dataset.map`] 함수를 사용하세요. `batched=True`로 설정해 데이터 세트의 여러 요소들을 한 번에 처리하면 `map` 함수의 속도를 빠르게 할 수 있습니다. 필요하지 않은 열은 모두 제거합니다:
|
||||
|
||||
```py
|
||||
>>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
|
||||
```
|
||||
|
||||
이제 [`DefaultDataCollator`]를 이용해 예시 배치를 생성합니다. 🤗 Transformers의 다른 데이터 콜레이터(data collator)와 달리, [`DefaultDataCollator`]는 패딩과 같은 추가 전처리를 적용하지 않습니다:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
```py
|
||||
>>> from transformers import DefaultDataCollator
|
||||
|
||||
>>> data_collator = DefaultDataCollator()
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
```py
|
||||
>>> from transformers import DefaultDataCollator
|
||||
|
||||
>>> data_collator = DefaultDataCollator(return_tensors="tf")
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## 훈련[[train]]
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
<Tip>
|
||||
|
||||
[`Trainer`]를 이용해 모델을 미세 조정하는 것에 익숙하지 않다면, [여기](../training#train-with-pytorch-trainer)에서 기초 튜토리얼을 살펴보세요!
|
||||
|
||||
</Tip>
|
||||
|
||||
이제 모델 훈련을 시작할 준비가 되었습니다! [`AutoModelForQuestionAnswering`]으로 DistilBERT를 가져옵니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
|
||||
|
||||
>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
이제 세 단계만 남았습니다:
|
||||
|
||||
1. [`TrainingArguments`]에서 훈련 하이퍼파라미터를 정합니다. 꼭 필요한 매개변수는 모델을 저장할 위치를 지정하는 `output_dir` 입니다. `push_to_hub=True`로 설정해서 이 모델을 Hub로 푸시합니다 (모델을 업로드하려면 Hugging Face에 로그인해야 합니다).
|
||||
2. 모델, 데이터 세트, 토크나이저, 데이터 콜레이터와 함께 [`Trainer`]에 훈련 인수들을 전달합니다.
|
||||
3. [`~Trainer.train`]을 호출해서 모델을 미세 조정합니다.
|
||||
|
||||
```py
|
||||
>>> training_args = TrainingArguments(
|
||||
... output_dir="my_awesome_qa_model",
|
||||
... evaluation_strategy="epoch",
|
||||
... learning_rate=2e-5,
|
||||
... per_device_train_batch_size=16,
|
||||
... per_device_eval_batch_size=16,
|
||||
... num_train_epochs=3,
|
||||
... weight_decay=0.01,
|
||||
... push_to_hub=True,
|
||||
... )
|
||||
|
||||
>>> trainer = Trainer(
|
||||
... model=model,
|
||||
... args=training_args,
|
||||
... train_dataset=tokenized_squad["train"],
|
||||
... eval_dataset=tokenized_squad["test"],
|
||||
... tokenizer=tokenizer,
|
||||
... data_collator=data_collator,
|
||||
... )
|
||||
|
||||
>>> trainer.train()
|
||||
```
|
||||
|
||||
훈련이 완료되면, [`~transformers.Trainer.push_to_hub`] 매소드를 사용해 모델을 Hub에 공유해서 모든 사람들이 사용할 수 있게 공유해주세요:
|
||||
|
||||
```py
|
||||
>>> trainer.push_to_hub()
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
<Tip>
|
||||
|
||||
Keras로 모델을 미세 조정하는 것에 익숙하지 않다면, [여기](../training#train-a-tensorflow-model-with-keras)에서 기초 튜토리얼을 살펴보세요!
|
||||
|
||||
</Tip>
|
||||
TensorFlow를 이용한 모델을 미세 조정하려면 옵티마이저 함수, 학습률 스케쥴 및 몇 가지 훈련 하이퍼파라미터를 설정하는 것부터 시작해야합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import create_optimizer
|
||||
|
||||
>>> batch_size = 16
|
||||
>>> num_epochs = 2
|
||||
>>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
|
||||
>>> optimizer, schedule = create_optimizer(
|
||||
... init_lr=2e-5,
|
||||
... num_warmup_steps=0,
|
||||
... num_train_steps=total_train_steps,
|
||||
... )
|
||||
```
|
||||
|
||||
그 다음 [`TFAutoModelForQuestionAnswering`]으로 DistilBERT를 가져옵니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForQuestionAnswering
|
||||
|
||||
>>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
[`~transformers.TFPreTrainedModel.prepare_tf_dataset`]을 사용해서 데이터 세트를 `tf.data.Dataset` 형식으로 변환합니다:
|
||||
|
||||
```py
|
||||
>>> tf_train_set = model.prepare_tf_dataset(
|
||||
... tokenized_squad["train"],
|
||||
... shuffle=True,
|
||||
... batch_size=16,
|
||||
... collate_fn=data_collator,
|
||||
... )
|
||||
|
||||
>>> tf_validation_set = model.prepare_tf_dataset(
|
||||
... tokenized_squad["test"],
|
||||
... shuffle=False,
|
||||
... batch_size=16,
|
||||
... collate_fn=data_collator,
|
||||
... )
|
||||
```
|
||||
|
||||
[`compile`](https://keras.io/api/models/model_training_apis/#compile-method)로 훈련할 모델을 설정합니다:
|
||||
|
||||
```py
|
||||
>>> import tensorflow as tf
|
||||
|
||||
>>> model.compile(optimizer=optimizer)
|
||||
```
|
||||
|
||||
마지막으로 모델을 Hub로 푸시할 방법을 설정합니다. [`~transformers.PushToHubCallback`]에서 모델과 토크나이저를 푸시할 경로를 설정합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import PushToHubCallback
|
||||
|
||||
>>> callback = PushToHubCallback(
|
||||
... output_dir="my_awesome_qa_model",
|
||||
... tokenizer=tokenizer,
|
||||
... )
|
||||
```
|
||||
|
||||
드디어 모델 훈련을 시작할 준비가 되었습니다! 훈련 데이터 세트와 평가 데이터 세트, 에폭 수, 콜백을 설정한 후 [`fit`](https://keras.io/api/models/model_training_apis/#fit-method)을 이용해 모델을 미세 조정합니다:
|
||||
|
||||
```py
|
||||
>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback])
|
||||
```
|
||||
훈련이 완료되면 모델이 자동으로 Hub에 업로드되어 누구나 사용할 수 있습니다!
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
<Tip>
|
||||
|
||||
질의 응답을 위해 모델을 미세 조정하는 방법에 대한 더 자세한 예시는 [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) 또는 [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)을 참조하세요.
|
||||
|
||||
</Tip>
|
||||
|
||||
## 평가[[evaluate]]
|
||||
|
||||
질의 응답을 평가하려면 상당한 양의 후처리가 필요합니다. 시간이 너무 많이 걸리지 않도록 이 가이드에서는 평가 단계를 생략합니다. [`Trainer`]는 훈련 과정에서 평가 손실(evaluation loss)을 계속 계산하기 때문에 모델의 성능을 대략적으로 알 수 있습니다.
|
||||
|
||||
시간에 여유가 있고 질의 응답 모델을 평가하는 방법에 관심이 있다면 🤗 Hugging Face Course의 [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) 챕터를 살펴보세요!
|
||||
|
||||
## 추론[[inference]]
|
||||
|
||||
이제 모델을 미세 조정했으니 추론에 사용할 수 있습니다!
|
||||
|
||||
질문과 모델이 예측하기 원하는 문맥(context)를 생각해보세요:
|
||||
|
||||
```py
|
||||
>>> question = "How many programming languages does BLOOM support?"
|
||||
>>> context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."
|
||||
```
|
||||
|
||||
추론을 위해 미세 조정한 모델을 테스트하는 가장 쉬운 방법은 [`pipeline`]을 사용하는 것 입니다. 모델을 사용해 질의 응답을 하기 위해서 `pipeline`을 인스턴스화하고 텍스트를 입력합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> question_answerer = pipeline("question-answering", model="my_awesome_qa_model")
|
||||
>>> question_answerer(question=question, context=context)
|
||||
{'score': 0.2058267742395401,
|
||||
'start': 10,
|
||||
'end': 95,
|
||||
'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}
|
||||
```
|
||||
|
||||
원한다면 `pipeline`의 결과를 직접 복제할 수도 있습니다:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
텍스트를 토큰화해서 PyTorch 텐서를 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
|
||||
>>> inputs = tokenizer(question, context, return_tensors="pt")
|
||||
```
|
||||
|
||||
모델에 입력을 전달하고 `logits`을 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForQuestionAnswering
|
||||
|
||||
>>> model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
|
||||
>>> with torch.no_grad():
|
||||
... outputs = model(**inputs)
|
||||
```
|
||||
|
||||
모델의 출력에서 시작 및 종료 위치가 어딘지 가장 높은 확률을 얻습니다:
|
||||
|
||||
```py
|
||||
>>> answer_start_index = outputs.start_logits.argmax()
|
||||
>>> answer_end_index = outputs.end_logits.argmax()
|
||||
```
|
||||
|
||||
예측된 토큰을 해독해서 답을 얻습니다:
|
||||
|
||||
```py
|
||||
>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
|
||||
>>> tokenizer.decode(predict_answer_tokens)
|
||||
'176 billion parameters and can generate text in 46 languages natural languages and 13'
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
텍스트를 토큰화해서 TensorFlow 텐서를 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
|
||||
>>> inputs = tokenizer(question, text, return_tensors="tf")
|
||||
```
|
||||
|
||||
모델에 입력을 전달하고 `logits`을 반환합니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForQuestionAnswering
|
||||
|
||||
>>> model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
|
||||
>>> outputs = model(**inputs)
|
||||
```
|
||||
|
||||
모델의 출력에서 시작 및 종료 위치가 어딘지 가장 높은 확률을 얻습니다:
|
||||
|
||||
```py
|
||||
>>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
|
||||
>>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
|
||||
```
|
||||
|
||||
예측된 토큰을 해독해서 답을 얻습니다:
|
||||
|
||||
```py
|
||||
>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
|
||||
>>> tokenizer.decode(predict_answer_tokens)
|
||||
'176 billion parameters and can generate text in 46 languages natural languages and 13'
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
@ -1,140 +0,0 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# 제로샷(zero-shot) 이미지 분류[[zeroshot-image-classification]]
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
제로샷(zero-shot) 이미지 분류는 특정 카테고리의 예시가 포함된 데이터를 학습되지 않은 모델을 사용해 이미지 분류를 수행하는 작업입니다.
|
||||
|
||||
일반적으로 이미지 분류를 위해서는 레이블이 달린 특정 이미지 데이터로 모델 학습이 필요하며, 이 모델은 특정 이미지의 특징을 레이블에 "매핑"하는 방법을 학습합니다.
|
||||
새로운 레이블이 있는 분류 작업에 이러한 모델을 사용해야 하는 경우에는, 모델을 "재보정"하기 위해 미세 조정이 필요합니다.
|
||||
|
||||
이와 대조적으로, 제로샷 또는 개방형 어휘(open vocabulary) 이미지 분류 모델은 일반적으로 대규모 이미지 데이터와 해당 설명에 대해 학습된 멀티모달(multimodal) 모델입니다.
|
||||
이러한 모델은 제로샷 이미지 분류를 포함한 많은 다운스트림 작업에 사용할 수 있는 정렬된(aligned) 비전 언어 표현을 학습합니다.
|
||||
|
||||
이는 이미지 분류에 대한 보다 유연한 접근 방식으로, 추가 학습 데이터 없이 새로운 레이블이나 학습하지 못한 카테고리에 대해 모델을 일반화할 수 있습니다.
|
||||
또한, 사용자가 대상 개체에 대한 자유 형식의 텍스트 설명으로 이미지를 검색할 수 있습니다.
|
||||
|
||||
이번 가이드에서 배울 내용은 다음과 같습니다:
|
||||
|
||||
* 제로샷 이미지 분류 파이프라인 만들기
|
||||
* 직접 제로샷 이미지 분류 모델 추론 실행하기
|
||||
|
||||
시작하기 전에 필요한 라이브러리가 모두 설치되어 있는지 확인하세요:
|
||||
|
||||
```bash
|
||||
pip install -q transformers
|
||||
```
|
||||
|
||||
## 제로샷(zero-shot) 이미지 분류 파이프라인[[zeroshot-image-classification-pipeline]]
|
||||
|
||||
[`pipeline`]을 활용하면 가장 간단하게 제로샷 이미지 분류를 지원하는 모델로 추론해볼 수 있습니다.
|
||||
[Hugging Face Hub에 업로드된 체크포인트](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&sort=downloads)에서 파이프라인을 인스턴스화합니다.
|
||||
|
||||
```python
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> checkpoint = "openai/clip-vit-large-patch14"
|
||||
>>> detector = pipeline(model=checkpoint, task="zero-shot-image-classification")
|
||||
```
|
||||
|
||||
다음으로, 분류하고 싶은 이미지를 선택하세요.
|
||||
|
||||
```py
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
|
||||
>>> url = "https://unsplash.com/photos/g8oS8-82DxI/download?ixid=MnwxMjA3fDB8MXx0b3BpY3x8SnBnNktpZGwtSGt8fHx8fDJ8fDE2NzgxMDYwODc&force=true&w=640"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/owl.jpg" alt="Photo of an owl"/>
|
||||
</div>
|
||||
|
||||
이미지와 해당 이미지의 후보 레이블인 `candidate_labels`를 파이프라인으로 전달합니다.
|
||||
여기서는 이미지를 직접 전달하지만, 컴퓨터에 저장된 이미지의 경로나 url로 전달할 수도 있습니다.
|
||||
`candidate_labels`는 이 예시처럼 간단한 단어일 수도 있고 좀 더 설명적인 단어일 수도 있습니다.
|
||||
|
||||
```py
|
||||
>>> predictions = classifier(image, candidate_labels=["fox", "bear", "seagull", "owl"])
|
||||
>>> predictions
|
||||
[{'score': 0.9996670484542847, 'label': 'owl'},
|
||||
{'score': 0.000199399160919711, 'label': 'seagull'},
|
||||
{'score': 7.392891711788252e-05, 'label': 'fox'},
|
||||
{'score': 5.96074532950297e-05, 'label': 'bear'}]
|
||||
```
|
||||
|
||||
## 직접 제로샷(zero-shot) 이미지 분류하기[[zeroshot-image-classification-by-hand]]
|
||||
|
||||
이제 제로샷 이미지 분류 파이프라인 사용 방법을 살펴보았으니, 실행하는 방법을 살펴보겠습니다.
|
||||
|
||||
[Hugging Face Hub에 업로드된 체크포인트](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&sort=downloads)에서 모델과 프로세서를 가져오는 것으로 시작합니다.
|
||||
여기서는 이전과 동일한 체크포인트를 사용하겠습니다:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoProcessor, AutoModelForZeroShotImageClassification
|
||||
|
||||
>>> model = AutoModelForZeroShotImageClassification.from_pretrained(checkpoint)
|
||||
>>> processor = AutoProcessor.from_pretrained(checkpoint)
|
||||
```
|
||||
|
||||
다른 이미지를 사용해 보겠습니다.
|
||||
|
||||
```py
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
|
||||
>>> url = "https://unsplash.com/photos/xBRQfR2bqNI/download?ixid=MnwxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNjc4Mzg4ODEx&force=true&w=640"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg" alt="Photo of a car"/>
|
||||
</div>
|
||||
|
||||
프로세서를 사용해 모델의 입력을 준비합니다.
|
||||
프로세서는 모델의 입력으로 사용하기 위해 이미지 크기를 변환하고 정규화하는 이미지 프로세서와 텍스트 입력을 처리하는 토크나이저로 구성됩니다.
|
||||
|
||||
```py
|
||||
>>> candidate_labels = ["tree", "car", "bike", "cat"]
|
||||
>>> inputs = processor(images=image, text=candidate_labels, return_tensors="pt", padding=True)
|
||||
```
|
||||
|
||||
모델에 입력을 전달하고, 결과를 후처리합니다:
|
||||
|
||||
```py
|
||||
>>> import torch
|
||||
|
||||
>>> with torch.no_grad():
|
||||
... outputs = model(**inputs)
|
||||
|
||||
>>> logits = outputs.logits_per_image[0]
|
||||
>>> probs = logits.softmax(dim=-1).numpy()
|
||||
>>> scores = probs.tolist()
|
||||
|
||||
>>> result = [
|
||||
... {"score": score, "label": candidate_label}
|
||||
... for score, candidate_label in sorted(zip(probs, candidate_labels), key=lambda x: -x[0])
|
||||
... ]
|
||||
|
||||
>>> result
|
||||
[{'score': 0.998572, 'label': 'car'},
|
||||
{'score': 0.0010570387, 'label': 'bike'},
|
||||
{'score': 0.0003393686, 'label': 'tree'},
|
||||
{'score': 3.1572064e-05, 'label': 'cat'}]
|
||||
```
|
@ -1,185 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# TorchScript로 내보내기[[export-to-torchscript]]
|
||||
|
||||
<Tip>
|
||||
|
||||
TorchScript를 활용한 실험은 아직 초기 단계로, 가변적인 입력 크기 모델들을 통해 그 기능성을 계속 탐구하고 있습니다.
|
||||
이 기능은 저희가 관심을 두고 있는 분야 중 하나이며,
|
||||
앞으로 출시될 버전에서 더 많은 코드 예제, 더 유연한 구현, 그리고 Python 기반 코드와 컴파일된 TorchScript를 비교하는 벤치마크를 등을 통해 분석을 심화할 예정입니다.
|
||||
|
||||
</Tip>
|
||||
|
||||
[TorchScript 문서](https://pytorch.org/docs/stable/jit.html)에서는 이렇게 말합니다.
|
||||
|
||||
> TorchScript는 PyTorch 코드에서 직렬화 및 최적화 가능한 모델을 생성하는 방법입니다.
|
||||
|
||||
[JIT과 TRACE](https://pytorch.org/docs/stable/jit.html)는 개발자가 모델을 내보내서 효율 지향적인 C++ 프로그램과 같은 다른 프로그램에서 재사용할 수 있도록 하는 PyTorch 모듈입니다.
|
||||
|
||||
PyTorch 기반 Python 프로그램과 다른 환경에서 모델을 재사용할 수 있도록, 🤗 Transformers 모델을 TorchScript로 내보낼 수 있는 인터페이스를 제공합니다.
|
||||
이 문서에서는 TorchScript를 사용하여 모델을 내보내고 사용하는 방법을 설명합니다.
|
||||
|
||||
모델을 내보내려면 두 가지가 필요합니다:
|
||||
|
||||
- `torchscript` 플래그로 모델 인스턴스화
|
||||
- 더미 입력을 사용한 순전파(forward pass)
|
||||
|
||||
이 필수 조건들은 아래에 자세히 설명된 것처럼 개발자들이 주의해야 할 여러 사항들을 의미합니다.
|
||||
|
||||
## TorchScript 플래그와 묶인 가중치(tied weights)[[torchscript-flag-and-tied-weights]]
|
||||
|
||||
`torchscript` 플래그가 필요한 이유는 대부분의 🤗 Transformers 언어 모델에서 `Embedding` 레이어와 `Decoding` 레이어 간의 묶인 가중치(tied weights)가 존재하기 때문입니다.
|
||||
TorchScript는 묶인 가중치를 가진 모델을 내보낼 수 없으므로, 미리 가중치를 풀고 복제해야 합니다.
|
||||
|
||||
`torchscript` 플래그로 인스턴스화된 모델은 `Embedding` 레이어와 `Decoding` 레이어가 분리되어 있으므로 이후에 훈련해서는 안 됩니다.
|
||||
훈련을 하게 되면 두 레이어 간 동기화가 해제되어 예상치 못한 결과가 발생할 수 있습니다.
|
||||
|
||||
언어 모델 헤드를 갖지 않은 모델은 가중치가 묶여 있지 않아서 이 문제가 발생하지 않습니다.
|
||||
이러한 모델들은 `torchscript` 플래그 없이 안전하게 내보낼 수 있습니다.
|
||||
|
||||
## 더미 입력과 표준 길이[[dummy-inputs-and-standard-lengths]]
|
||||
|
||||
더미 입력(dummy inputs)은 모델의 순전파(forward pass)에 사용됩니다.
|
||||
입력 값이 레이어를 통해 전파되는 동안, PyTorch는 각 텐서에서 실행된 다른 연산을 추적합니다.
|
||||
이러한 기록된 연산은 모델의 *추적(trace)*을 생성하는 데 사용됩니다.
|
||||
|
||||
추적은 입력의 차원을 기준으로 생성됩니다.
|
||||
따라서 더미 입력의 차원에 제한되어, 다른 시퀀스 길이나 배치 크기에서는 작동하지 않습니다.
|
||||
다른 크기로 시도할 경우 다음과 같은 오류가 발생합니다:
|
||||
|
||||
```
|
||||
`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
|
||||
```
|
||||
추론 중 모델에 공급될 가장 큰 입력만큼 큰 더미 입력 크기로 모델을 추적하는 것이 좋습니다.
|
||||
패딩은 누락된 값을 채우는 데 도움이 될 수 있습니다.
|
||||
그러나 모델이 더 큰 입력 크기로 추적되기 때문에, 행렬의 차원이 커지고 계산량이 많아집니다.
|
||||
|
||||
다양한 시퀀스 길이 모델을 내보낼 때는 각 입력에 대해 수행되는 총 연산 횟수에 주의하고 성능을 주의 깊게 확인하세요.
|
||||
|
||||
## Python에서 TorchScript 사용하기[[using-torchscript-in-python]]
|
||||
|
||||
이 섹션에서는 모델을 저장하고 가져오는 방법, 추적을 사용하여 추론하는 방법을 보여줍니다.
|
||||
|
||||
### 모델 저장하기[[saving-a-model]]
|
||||
|
||||
`BertModel`을 TorchScript로 내보내려면 `BertConfig` 클래스에서 `BertModel`을 인스턴스화한 다음, `traced_bert.pt`라는 파일명으로 디스크에 저장하면 됩니다.
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
|
||||
enc = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
# 입력 텍스트 토큰화하기
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = enc.tokenize(text)
|
||||
|
||||
# 입력 토큰 중 하나를 마스킹하기
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = "[MASK]"
|
||||
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
# 더미 입력 만들기
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
dummy_input = [tokens_tensor, segments_tensors]
|
||||
|
||||
# torchscript 플래그로 모델 초기화하기
|
||||
# 이 모델은 LM 헤드가 없으므로 필요하지 않지만, 플래그를 True로 설정합니다.
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=32000,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
# 모델을 인스턴트화하기
|
||||
model = BertModel(config)
|
||||
|
||||
# 모델을 평가 모드로 두어야 합니다.
|
||||
model.eval()
|
||||
|
||||
# 만약 *from_pretrained*를 사용하여 모델을 인스턴스화하는 경우, TorchScript 플래그를 쉽게 설정할 수 있습니다
|
||||
model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
|
||||
|
||||
# 추적 생성하기
|
||||
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
torch.jit.save(traced_model, "traced_bert.pt")
|
||||
```
|
||||
|
||||
### 모델 가져오기[[loading-a-model]]
|
||||
|
||||
이제 이전에 저장한 `BertModel`, 즉 `traced_bert.pt`를 디스크에서 가져오고, 이전에 초기화한 `dummy_input`에서 사용할 수 있습니다.
|
||||
|
||||
```python
|
||||
loaded_model = torch.jit.load("traced_bert.pt")
|
||||
loaded_model.eval()
|
||||
|
||||
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
|
||||
```
|
||||
|
||||
### 추적된 모델을 사용하여 추론하기[[using-a-traced-model-for-inference]]
|
||||
|
||||
`__call__` 이중 언더스코어(dunder) 메소드를 사용하여 추론에 추적된 모델을 사용하세요:
|
||||
|
||||
```python
|
||||
traced_model(tokens_tensor, segments_tensors)
|
||||
```
|
||||
|
||||
## Neuron SDK로 Hugging Face TorchScript 모델을 AWS에 배포하기[[deploy-hugging-face-torchscript-models-to-aws-with-the-neuron-sdk]]
|
||||
|
||||
AWS가 클라우드에서 저비용, 고성능 머신 러닝 추론을 위한 [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) 인스턴스 제품군을 출시했습니다.
|
||||
Inf1 인스턴스는 딥러닝 추론 워크로드에 특화된 맞춤 하드웨어 가속기인 AWS Inferentia 칩으로 구동됩니다.
|
||||
[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#)은 Inferentia를 위한 SDK로, Inf1에 배포하기 위한 transformers 모델 추적 및 최적화를 지원합니다.
|
||||
Neuron SDK는 다음과 같은 기능을 제공합니다:
|
||||
|
||||
1. 코드 한 줄만 변경하면 클라우드 추론를 위해 TorchScript 모델을 추적하고 최적화할 수 있는 쉬운 API
|
||||
2. 즉시 사용 가능한 성능 최적화로 [비용 효율 향상](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>)
|
||||
3. [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) 또는 [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html)로 구축된 Hugging Face transformers 모델 지원
|
||||
|
||||
### 시사점[[implications]]
|
||||
|
||||
[BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert) 아키텍처 또는 그 변형인 [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) 및 [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta)를 기반으로 한 Transformers 모델은 추출 기반 질의응답, 시퀀스 분류 및 토큰 분류와 같은 비생성 작업 시 Inf1에서 최상의 성능을 보입니다.
|
||||
그러나 텍스트 생성 작업도 [AWS Neuron MarianMT 튜토리얼](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html)을 따라 Inf1에서 실행되도록 조정할 수 있습니다.
|
||||
|
||||
Inferentia에서 바로 변환할 수 있는 모델에 대한 자세한 정보는 Neuron 문서의 [Model Architecture Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) 섹션에서 확인할 수 있습니다.
|
||||
|
||||
### 종속성[[dependencies]]
|
||||
|
||||
AWS Neuron을 사용하여 모델을 변환하려면 [Neuron SDK 환경](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide)이 필요합니다.
|
||||
이는 [AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html)에 미리 구성되어 있습니다.
|
||||
|
||||
### AWS Neuron으로 모델 변환하기[[converting-a-model-for-aws-neuron]]
|
||||
|
||||
`BertModel`을 추적하려면, [Python에서 TorchScript 사용하기](torchscript#using-torchscript-in-python)에서와 동일한 코드를 사용해서 AWS NEURON용 모델을 변환합니다.
|
||||
`torch.neuron` 프레임워크 익스텐션을 가져와 Python API를 통해 Neuron SDK의 구성 요소에 접근합니다:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
import torch.neuron
|
||||
```
|
||||
|
||||
다음 줄만 수정하면 됩니다:
|
||||
|
||||
```diff
|
||||
- torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
+ torch.neuron.trace(model, [token_tensor, segments_tensors])
|
||||
```
|
||||
|
||||
이로써 Neuron SDK가 모델을 추적하고 Inf1 인스턴스에 최적화할 수 있게 됩니다.
|
||||
|
||||
AWS Neuron SDK의 기능, 도구, 예제 튜토리얼 및 최신 업데이트에 대해 자세히 알아보려면 [AWS NeuronSDK 문서](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html)를 참조하세요.
|
@ -418,14 +418,13 @@ class FlaxDataCollatorForT5MLM:
|
||||
orig_length = length
|
||||
|
||||
num_noise_tokens = int(np.round(length * self.noise_density))
|
||||
num_nonnoise_tokens = length - num_noise_tokens
|
||||
# avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens.
|
||||
num_noise_tokens = min(max(num_noise_tokens, 1), length - 1)
|
||||
# num_noise_tokens should be less than num_noise_tokens and num_nonnoise_tokens
|
||||
num_noise_spans = int(np.round(min(num_noise_tokens, num_nonnoise_tokens) / self.mean_noise_span_length))
|
||||
num_noise_spans = int(np.round(num_noise_tokens / self.mean_noise_span_length))
|
||||
|
||||
# avoid degeneracy by ensuring positive number of noise spans
|
||||
num_noise_spans = max(num_noise_spans, 1)
|
||||
num_nonnoise_tokens = length - num_noise_tokens
|
||||
|
||||
# pick the lengths of the noise spans and the non-noise spans
|
||||
def _random_segmentation(num_items, num_segments):
|
||||
|
@ -61,7 +61,7 @@ from transformers.utils import check_min_version, get_full_repo_name, send_examp
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
Array = Any
|
||||
Dataset = datasets.arrow_dataset.Dataset
|
||||
|
@ -54,7 +54,7 @@ from transformers.utils import check_min_version, get_full_repo_name, send_examp
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
Array = Any
|
||||
Dataset = datasets.arrow_dataset.Dataset
|
||||
|
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
|
@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")
|
||||
|
||||
|
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")
|
||||
|
||||
@ -510,8 +510,6 @@ def main():
|
||||
checkpoint = last_checkpoint
|
||||
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
||||
trainer.save_model()
|
||||
tokenizer.save_pretrained(training_args.output_dir)
|
||||
image_processor.save_pretrained(training_args.output_dir)
|
||||
trainer.log_metrics("train", train_result.metrics)
|
||||
trainer.save_metrics("train", train_result.metrics)
|
||||
trainer.save_state()
|
||||
|
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
|
||||
|
||||
|
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
@ -48,7 +48,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
@ -1,771 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import torch
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from accelerate.utils import set_seed
|
||||
from datasets import load_dataset
|
||||
from huggingface_hub import Repository
|
||||
from torch.utils.data import DataLoader
|
||||
from torchvision.transforms import Compose, Lambda, Normalize, RandomHorizontalFlip, RandomResizedCrop, ToTensor
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
IMAGE_PROCESSOR_MAPPING,
|
||||
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
|
||||
AutoConfig,
|
||||
AutoImageProcessor,
|
||||
AutoModelForMaskedImageModeling,
|
||||
SchedulerType,
|
||||
get_scheduler,
|
||||
)
|
||||
from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
""" Pre-training a 🤗 Transformers model for simple masked image modeling (SimMIM)
|
||||
without using HuggingFace Trainer.
|
||||
Any model supported by the AutoModelForMaskedImageModeling API can be used.
|
||||
"""
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Finetune a transformers model on a simple Masked Image Modeling task"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_name",
|
||||
type=str,
|
||||
default="cifar10",
|
||||
help="Name of a dataset from the datasets package",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_config_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The configuration name of the dataset to use (via the datasets library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_column_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The column name of the images in the files. If not set, will try to use 'image' or 'img'.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help="A folder containing the training data.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--validation_dir",
|
||||
type=None,
|
||||
default=None,
|
||||
help="A folder containing the validation data.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_val_split",
|
||||
type=float,
|
||||
default=0.15,
|
||||
help="Percent to split off of train for validation.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mask_patch_size",
|
||||
type=int,
|
||||
default=32,
|
||||
help="The size of the square patches to use for masking.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mask_ratio",
|
||||
type=float,
|
||||
default=0.6,
|
||||
help="Percentage of patches to mask.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_train_samples",
|
||||
type=int,
|
||||
default=None,
|
||||
help=(
|
||||
"For debugging purposes or quicker training, truncate the number of training examples to this "
|
||||
"value if set."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_eval_samples",
|
||||
type=int,
|
||||
default=None,
|
||||
help=(
|
||||
"For debugging purposes or quicker training, truncate the number of evaluation examples to this "
|
||||
"value if set."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"The model checkpoint for weights initialization. Can be a local path to a pytorch_model.bin or a "
|
||||
"checkpoint identifier on the hub. "
|
||||
"Don't set if you want to train a model from scratch."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
type=str,
|
||||
default=None,
|
||||
help="If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_name_or_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Pretrained config name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_overrides",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"Override some existing default config settings when a model is trained from scratch. Example: "
|
||||
"n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Where do you want to store (cache) the pretrained models/datasets downloaded from the hub",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_revision",
|
||||
type=str,
|
||||
default="main",
|
||||
help="The specific model version to use (can be a branch name, tag name or commit id).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_processor_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Name or path of preprocessor config.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use_auth_token",
|
||||
type=bool,
|
||||
default=False,
|
||||
help=(
|
||||
"Will use the token generated when running `huggingface-cli login` (necessary to use this script "
|
||||
"with private models)."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_size",
|
||||
type=int,
|
||||
default=None,
|
||||
help="The size (resolution) of each image. If not specified, will use `image_size` of the configuration.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--patch_size",
|
||||
type=int,
|
||||
default=None,
|
||||
help="The size (resolution) of each patch. If not specified, will use `patch_size` of the configuration.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--encoder_stride",
|
||||
type=int,
|
||||
default=None,
|
||||
help={"help": "Stride to use for the encoder."},
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the model to the Hub.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_tracking",
|
||||
action="store_true",
|
||||
help="Whether to enable experiment trackers for logging.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--report_to",
|
||||
type=str,
|
||||
default="all",
|
||||
help=(
|
||||
'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
|
||||
' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.'
|
||||
"Only applicable when `--with_tracking` is passed."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="A seed for reproducible training.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Batch size (per device) for the training dataloader.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=5e-5,
|
||||
help="The initial learning rate for [`AdamW`] optimizer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--weight_decay",
|
||||
type=float,
|
||||
default=0.0,
|
||||
help="Weight decay to use.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_train_epochs",
|
||||
type=float,
|
||||
default=3.0,
|
||||
help="Total number of training epochs to perform (if not an integer, will perform the decimal part percents of the last epoch before stopping training).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_train_steps",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lr_scheduler_type",
|
||||
type=SchedulerType,
|
||||
default="linear",
|
||||
help="The scheduler type to use.",
|
||||
choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_warmup_steps",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of steps for the warmup in the lr scheduler.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--checkpointing_steps",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resume_from_checkpoint",
|
||||
type=str,
|
||||
default=None,
|
||||
help="If the training should continue from a checkpoint folder.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_eval_batch_size",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Batch size (per device) for the evaluation dataloader.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Where to store the final model.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Sanity checks
|
||||
data_files = {}
|
||||
if args.train_dir is not None:
|
||||
data_files["train"] = args.train_dir
|
||||
if args.validation_dir is not None:
|
||||
data_files["val"] = args.validation_dir
|
||||
args.data_files = data_files if data_files else None
|
||||
|
||||
if args.push_to_hub:
|
||||
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
|
||||
|
||||
return args
|
||||
|
||||
|
||||
class MaskGenerator:
|
||||
"""
|
||||
A class to generate boolean masks for the pretraining task.
|
||||
|
||||
A mask is a 1D tensor of shape (model_patch_size**2,) where the value is either 0 or 1,
|
||||
where 1 indicates "masked".
|
||||
"""
|
||||
|
||||
def __init__(self, input_size=192, mask_patch_size=32, model_patch_size=4, mask_ratio=0.6):
|
||||
self.input_size = input_size
|
||||
self.mask_patch_size = mask_patch_size
|
||||
self.model_patch_size = model_patch_size
|
||||
self.mask_ratio = mask_ratio
|
||||
|
||||
if self.input_size % self.mask_patch_size != 0:
|
||||
raise ValueError("Input size must be divisible by mask patch size")
|
||||
if self.mask_patch_size % self.model_patch_size != 0:
|
||||
raise ValueError("Mask patch size must be divisible by model patch size")
|
||||
|
||||
self.rand_size = self.input_size // self.mask_patch_size
|
||||
self.scale = self.mask_patch_size // self.model_patch_size
|
||||
|
||||
self.token_count = self.rand_size**2
|
||||
self.mask_count = int(np.ceil(self.token_count * self.mask_ratio))
|
||||
|
||||
def __call__(self):
|
||||
mask_idx = np.random.permutation(self.token_count)[: self.mask_count]
|
||||
mask = np.zeros(self.token_count, dtype=int)
|
||||
mask[mask_idx] = 1
|
||||
|
||||
mask = mask.reshape((self.rand_size, self.rand_size))
|
||||
mask = mask.repeat(self.scale, axis=0).repeat(self.scale, axis=1)
|
||||
|
||||
return torch.tensor(mask.flatten())
|
||||
|
||||
|
||||
def collate_fn(examples):
|
||||
pixel_values = torch.stack([example["pixel_values"] for example in examples])
|
||||
mask = torch.stack([example["mask"] for example in examples])
|
||||
return {"pixel_values": pixel_values, "bool_masked_pos": mask}
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
# Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
|
||||
# information sent is the one passed as arguments along with your Python/PyTorch versions.
|
||||
send_example_telemetry("run_mim_no_trainer", args)
|
||||
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
|
||||
# in the environment
|
||||
accelerator_log_kwargs = {}
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator_log_kwargs["log_with"] = args.report_to
|
||||
accelerator_log_kwargs["logging_dir"] = args.output_dir
|
||||
|
||||
accelerator = Accelerator(
|
||||
gradient_accumulation_steps=args.gradient_accumulation_steps,
|
||||
**accelerator_log_kwargs,
|
||||
)
|
||||
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
logger.info(accelerator.state)
|
||||
if accelerator.is_local_main_process:
|
||||
datasets.utils.logging.set_verbosity_warning()
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
else:
|
||||
datasets.utils.logging.set_verbosity_error()
|
||||
transformers.utils.logging.set_verbosity_error()
|
||||
|
||||
# If passed along, set the training seed now.
|
||||
if args.seed is not None:
|
||||
set_seed(args.seed)
|
||||
|
||||
# Handle the repository creation
|
||||
if accelerator.is_main_process:
|
||||
if args.push_to_hub:
|
||||
if args.hub_model_id is None:
|
||||
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
repo = Repository(args.output_dir, clone_from=repo_name)
|
||||
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
||||
if "step_*" not in gitignore:
|
||||
gitignore.write("step_*\n")
|
||||
if "epoch_*" not in gitignore:
|
||||
gitignore.write("epoch_*\n")
|
||||
elif args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
accelerator.wait_for_everyone()
|
||||
|
||||
# Initialize our dataset.
|
||||
ds = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
data_files=args.data_files,
|
||||
cache_dir=args.cache_dir,
|
||||
use_auth_token=True if args.use_auth_token else None,
|
||||
)
|
||||
|
||||
# If we don't have a validation split, split off a percentage of train as validation.
|
||||
args.train_val_split = None if "validation" in ds.keys() else args.train_val_split
|
||||
if isinstance(args.train_val_split, float) and args.train_val_split > 0.0:
|
||||
split = ds["train"].train_test_split(args.train_val_split)
|
||||
ds["train"] = split["train"]
|
||||
ds["validation"] = split["test"]
|
||||
|
||||
# Create config
|
||||
# Distributed training:
|
||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
config_kwargs = {
|
||||
"cache_dir": args.cache_dir,
|
||||
"revision": args.model_revision,
|
||||
"use_auth_token": True if args.use_auth_token else None,
|
||||
}
|
||||
if args.config_name_or_path:
|
||||
config = AutoConfig.from_pretrained(args.config_name_or_path, **config_kwargs)
|
||||
elif args.model_name_or_path:
|
||||
config = AutoConfig.from_pretrained(args.model_name_or_path, **config_kwargs)
|
||||
else:
|
||||
config = CONFIG_MAPPING[args.model_type]()
|
||||
logger.warning("You are instantiating a new config instance from scratch.")
|
||||
if args.config_overrides is not None:
|
||||
logger.info(f"Overriding config: {args.config_overrides}")
|
||||
config.update_from_string(args.config_overrides)
|
||||
logger.info(f"New config: {config}")
|
||||
|
||||
# make sure the decoder_type is "simmim" (only relevant for BEiT)
|
||||
if hasattr(config, "decoder_type"):
|
||||
config.decoder_type = "simmim"
|
||||
|
||||
# adapt config
|
||||
args.image_size = args.image_size if args.image_size is not None else config.image_size
|
||||
args.patch_size = args.patch_size if args.patch_size is not None else config.patch_size
|
||||
args.encoder_stride = args.encoder_stride if args.encoder_stride is not None else config.encoder_stride
|
||||
|
||||
config.update(
|
||||
{
|
||||
"image_size": args.image_size,
|
||||
"patch_size": args.patch_size,
|
||||
"encoder_stride": args.encoder_stride,
|
||||
}
|
||||
)
|
||||
|
||||
# create image processor
|
||||
if args.image_processor_name:
|
||||
image_processor = AutoImageProcessor.from_pretrained(args.image_processor_name, **config_kwargs)
|
||||
elif args.model_name_or_path:
|
||||
image_processor = AutoImageProcessor.from_pretrained(args.model_name_or_path, **config_kwargs)
|
||||
else:
|
||||
IMAGE_PROCESSOR_TYPES = {
|
||||
conf.model_type: image_processor_class for conf, image_processor_class in IMAGE_PROCESSOR_MAPPING.items()
|
||||
}
|
||||
image_processor = IMAGE_PROCESSOR_TYPES[args.model_type]()
|
||||
|
||||
# create model
|
||||
if args.model_name_or_path:
|
||||
model = AutoModelForMaskedImageModeling.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir,
|
||||
revision=args.model_revision,
|
||||
use_auth_token=True if args.use_auth_token else None,
|
||||
)
|
||||
else:
|
||||
logger.info("Training new model from scratch")
|
||||
model = AutoModelForMaskedImageModeling.from_config(config)
|
||||
|
||||
column_names = ds["train"].column_names
|
||||
|
||||
if args.image_column_name is not None:
|
||||
image_column_name = args.image_column_name
|
||||
elif "image" in column_names:
|
||||
image_column_name = "image"
|
||||
elif "img" in column_names:
|
||||
image_column_name = "img"
|
||||
else:
|
||||
image_column_name = column_names[0]
|
||||
|
||||
# transformations as done in original SimMIM paper
|
||||
# source: https://github.com/microsoft/SimMIM/blob/main/data/data_simmim.py
|
||||
transforms = Compose(
|
||||
[
|
||||
Lambda(lambda img: img.convert("RGB")),
|
||||
RandomResizedCrop(args.image_size, scale=(0.67, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0)),
|
||||
RandomHorizontalFlip(),
|
||||
ToTensor(),
|
||||
Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
|
||||
]
|
||||
)
|
||||
|
||||
# create mask generator
|
||||
mask_generator = MaskGenerator(
|
||||
input_size=args.image_size,
|
||||
mask_patch_size=args.mask_patch_size,
|
||||
model_patch_size=args.patch_size,
|
||||
mask_ratio=args.mask_ratio,
|
||||
)
|
||||
|
||||
def preprocess_images(examples):
|
||||
"""Preprocess a batch of images by applying transforms + creating a corresponding mask, indicating
|
||||
which patches to mask."""
|
||||
|
||||
examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]]
|
||||
examples["mask"] = [mask_generator() for i in range(len(examples[image_column_name]))]
|
||||
|
||||
return examples
|
||||
|
||||
if args.max_train_samples is not None:
|
||||
ds["train"] = ds["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
|
||||
# Set the training transforms
|
||||
ds["train"].set_transform(preprocess_images)
|
||||
|
||||
if args.max_eval_samples is not None:
|
||||
ds["validation"] = ds["validation"].shuffle(seed=args.seed).select(range(args.max_eval_samples))
|
||||
# Set the validation transforms
|
||||
ds["validation"].set_transform(preprocess_images)
|
||||
|
||||
# DataLoaders creation:
|
||||
train_dataloader = DataLoader(
|
||||
ds["train"],
|
||||
shuffle=True,
|
||||
collate_fn=collate_fn,
|
||||
batch_size=args.per_device_train_batch_size,
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
ds["validation"],
|
||||
collate_fn=collate_fn,
|
||||
batch_size=args.per_device_eval_batch_size,
|
||||
)
|
||||
|
||||
# Optimizer
|
||||
# Split weights in two groups, one with weight decay and the other not.
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||
|
||||
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
|
||||
# shorter in multiprocess)
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
overrode_max_train_steps = False
|
||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||
if args.max_train_steps is None:
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
overrode_max_train_steps = True
|
||||
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
|
||||
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model,
|
||||
optimizer,
|
||||
train_dataloader,
|
||||
eval_dataloader,
|
||||
lr_scheduler,
|
||||
)
|
||||
|
||||
# On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
model.tie_weights()
|
||||
|
||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||
if overrode_max_train_steps:
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
# Afterwards we recalculate our number of training epochs
|
||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||
|
||||
# Figure out how many steps we should save the Accelerator states
|
||||
checkpointing_steps = args.checkpointing_steps
|
||||
if checkpointing_steps is not None and checkpointing_steps.isdigit():
|
||||
checkpointing_steps = int(checkpointing_steps)
|
||||
|
||||
# We need to initialize the trackers we use, and also store our configuration.
|
||||
# The trackers initializes automatically on the main process.
|
||||
if args.with_tracking:
|
||||
experiment_config = vars(args)
|
||||
# TensorBoard cannot log Enums, need the raw value
|
||||
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
|
||||
accelerator.init_trackers("mim_no_trainer", experiment_config)
|
||||
|
||||
# Train!
|
||||
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(f" Num examples = {len(ds['train'])}")
|
||||
logger.info(f" Num Epochs = {args.num_train_epochs}")
|
||||
logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
|
||||
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
|
||||
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
|
||||
logger.info(f" Total optimization steps = {args.max_train_steps}")
|
||||
# Only show the progress bar once on each machine.
|
||||
progress_bar = tqdm(range(int(args.max_train_steps)), disable=not accelerator.is_local_main_process)
|
||||
completed_steps = 0
|
||||
starting_epoch = 0
|
||||
|
||||
# Potentially load in the weights and states from a previous save
|
||||
if args.resume_from_checkpoint:
|
||||
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||
accelerator.load_state(args.resume_from_checkpoint)
|
||||
path = os.path.basename(args.resume_from_checkpoint)
|
||||
else:
|
||||
# Get the most recent checkpoint
|
||||
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||
dirs.sort(key=os.path.getctime)
|
||||
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||
# Extract `epoch_{i}` or `step_{i}`
|
||||
training_difference = os.path.splitext(path)[0]
|
||||
|
||||
if "epoch" in training_difference:
|
||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||
resume_step = None
|
||||
else:
|
||||
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||
starting_epoch = resume_step // len(train_dataloader)
|
||||
resume_step -= starting_epoch * len(train_dataloader)
|
||||
|
||||
# update the progress_bar if load from checkpoint
|
||||
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
||||
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||
|
||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||
model.train()
|
||||
if args.with_tracking:
|
||||
total_loss = 0
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We need to skip steps until we reach the resumed step
|
||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||
if resume_step is not None and step < resume_step:
|
||||
if step % args.gradient_accumulation_steps == 0:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
if isinstance(checkpointing_steps, int):
|
||||
if completed_steps % checkpointing_steps == 0:
|
||||
output_dir = f"step_{completed_steps }"
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
|
||||
if completed_steps >= args.max_train_steps:
|
||||
break
|
||||
|
||||
model.eval()
|
||||
losses = []
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
|
||||
loss = outputs.loss
|
||||
losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size)))
|
||||
|
||||
losses = torch.cat(losses)
|
||||
eval_loss = torch.mean(losses)
|
||||
|
||||
logger.info(f"epoch {epoch}: eval_loss: {eval_loss}")
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator.log(
|
||||
{
|
||||
"eval_loss": eval_loss,
|
||||
"train_loss": total_loss.item() / len(train_dataloader),
|
||||
"epoch": epoch,
|
||||
"step": completed_steps,
|
||||
},
|
||||
step=completed_steps,
|
||||
)
|
||||
|
||||
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
||||
accelerator.wait_for_everyone()
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
unwrapped_model.save_pretrained(
|
||||
args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
|
||||
)
|
||||
if accelerator.is_main_process:
|
||||
image_processor.save_pretrained(args.output_dir)
|
||||
repo.push_to_hub(
|
||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||
)
|
||||
|
||||
if args.checkpointing_steps == "epoch":
|
||||
output_dir = f"epoch_{epoch}"
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator.end_training()
|
||||
|
||||
if args.output_dir is not None:
|
||||
accelerator.wait_for_everyone()
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
unwrapped_model.save_pretrained(
|
||||
args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
|
||||
)
|
||||
if accelerator.is_main_process:
|
||||
image_processor.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
@ -47,7 +47,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -56,7 +56,7 @@ from transformers.utils import PaddingStrategy, check_min_version, get_full_repo
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
# You should update this to your particular problem to have better documentation of `model_type`
|
||||
|
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
@ -590,12 +590,12 @@ def main():
|
||||
# Format the result to the format the metric expects.
|
||||
if data_args.version_2_with_negative:
|
||||
formatted_predictions = [
|
||||
{"id": str(k), "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
|
||||
{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
|
||||
]
|
||||
else:
|
||||
formatted_predictions = [{"id": str(k), "prediction_text": v} for k, v in predictions.items()]
|
||||
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
|
||||
|
||||
references = [{"id": str(ex["id"]), "answers": ex[answer_column_name]} for ex in examples]
|
||||
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
|
||||
return EvalPrediction(predictions=formatted_predictions, label_ids=references)
|
||||
|
||||
metric = evaluate.load("squad_v2" if data_args.version_2_with_negative else "squad")
|
||||
|
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")
|
||||
|
||||
|
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@ -682,9 +682,7 @@ def main():
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
|
||||
|
||||
all_results = {
|
||||
f"eval_{k}": v.tolist() if isinstance(v, np.ndarray) else v for k, v in eval_metrics.items()
|
||||
}
|
||||
all_results = {f"eval_{k}": v for k, v in eval_metrics.items()}
|
||||
with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
|
||||
json.dump(all_results, f)
|
||||
|
||||
|
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
@ -705,7 +705,7 @@ def main():
|
||||
compute_metrics=compute_metrics,
|
||||
train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
|
||||
eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
|
||||
tokenizer=processor,
|
||||
tokenizer=feature_extractor,
|
||||
)
|
||||
|
||||
# 8. Finally, we can start training
|
||||
|
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
|
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
|
||||
|
||||
|
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
|
||||
|
@ -21,7 +21,6 @@ import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from unittest import mock
|
||||
|
||||
import torch
|
||||
@ -177,7 +176,6 @@ class ExamplesTestsNoTrainer(TestCasePlus):
|
||||
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
|
||||
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "ner_no_trainer")))
|
||||
|
||||
@unittest.skip(reason="Fix me @zack")
|
||||
@mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
|
||||
def test_run_squad_no_trainer(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
|
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
@ -486,8 +486,6 @@ def main():
|
||||
# Get the metric function
|
||||
if data_args.task_name is not None:
|
||||
metric = evaluate.load("glue", data_args.task_name)
|
||||
elif is_regression:
|
||||
metric = evaluate.load("mse")
|
||||
else:
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
@ -496,10 +494,15 @@ def main():
|
||||
def compute_metrics(p: EvalPrediction):
|
||||
preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
|
||||
preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
|
||||
result = metric.compute(predictions=preds, references=p.label_ids)
|
||||
if len(result) > 1:
|
||||
result["combined_score"] = np.mean(list(result.values())).item()
|
||||
return result
|
||||
if data_args.task_name is not None:
|
||||
result = metric.compute(predictions=preds, references=p.label_ids)
|
||||
if len(result) > 1:
|
||||
result["combined_score"] = np.mean(list(result.values())).item()
|
||||
return result
|
||||
elif is_regression:
|
||||
return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
|
||||
else:
|
||||
return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
|
||||
|
||||
# Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
|
||||
# we already did the padding.
|
||||
|
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
|
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.30.0.dev0")
|
||||
check_min_version("4.29.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user