Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-24 03:24:37 +08:00)
Compare commits: amdgpu-mul...v4.35.0 (353 commits)
| SHA1 | Author | Date | Message |
|---|---|---|---|
| f1185a4a73 | |||
| b6c0c2b906 | |||
| fba863b19d | |||
| 441c3e0dd2 | |||
| 8801861d2d | |||
| 443bf5e9e2 | |||
| 4557a0dede | |||
| 8a312956fd | |||
| 9b25c164bd | |||
| c52e429b1c | |||
| 7adaefe2bc | |||
| af3de8d87c | |||
| 3520e37e86 | |||
| 95020f208e | |||
| c9e72f55b2 | |||
| 239cd0eaa2 | |||
| 1e32b05e06 | |||
| 21a2fbaf48 | |||
| f8afb2b2ec | |||
| 391d14e810 | |||
| f9b4bea0a6 | |||
| 037fb7d0e1 | |||
| f3c1a172bb | |||
| 636f704d0b | |||
| 71025520bc | |||
| ae093eef01 | |||
| 82c7e87987 | |||
| 7d8ff3629b | |||
| 113ebf80ac | |||
| 25e6e9418c | |||
| 50378cbf6c | |||
| 77930f8a01 | |||
| 6b7f8ff1f3 | |||
| e22b7ced9a | |||
| 4bb50aa212 | |||
| 05f2290114 | |||
| 309a90664f | |||
| f53041a753 | |||
| 08fadc8085 | |||
| a8e74ebdc5 | |||
| 2963e196ee | |||
| 3cd3eaf960 | |||
| b5db8ca66f | |||
| 9dc4ce9ea7 | |||
| 14bb196cc8 | |||
| 9234caefb0 | |||
| b5c8e23f0f | |||
| df6f36a171 | |||
| 8211c59b9a | |||
| d39352d12c | |||
| e971486d89 | |||
| f7ea959b96 | |||
| 5bbf671276 | |||
| 84724efd10 | |||
| 9093b19b13 | |||
| 3224c0c13f | |||
| cd19b19378 | |||
| 6b466771b0 | |||
| 576994963f | |||
| 691fd8fdde | |||
| d751dbecb2 | |||
| 5fbed2d7ca | |||
| e830495c1c | |||
| 160432110c | |||
| 211ad4c9cc | |||
| 722e936491 | |||
| 9e87618f2b | |||
| ef23b68ebf | |||
| 96f9e78f4c | |||
| ac5893756b | |||
| 29c74f58ae | |||
| ffff9e70ab | |||
| 5be1fb6d1f | |||
| 66b088faf0 | |||
| e2bffcfafd | |||
| 90ee9cea19 | |||
| aa4198a238 | |||
| 6f31601687 | |||
| 34a640642b | |||
| 1892592530 | |||
| 8214d6e7b1 | |||
| d7cb5e138e | |||
| 4864d08d3e | |||
| 15cd096288 | |||
| fe2877ce21 | |||
| efba1a1744 | |||
| 90412401e6 | |||
| 3c2692407d | |||
| 9c5240af14 | |||
| df2eebf1e7 | |||
| a2f55a65cd | |||
| ba5144f7a9 | |||
| c34c50cdc0 | |||
| ba073ea9e3 | |||
| a64f8c1f87 | |||
| 0baa9246cb | |||
| 06e782da4e | |||
| 9286f0ac39 | |||
| 6cbc1369a3 | |||
| a0fd34483f | |||
| 9333bf0769 | |||
| 13ef14e18e | |||
| 9da451713d | |||
| 41496b95da | |||
| b18e31407c | |||
| cb0c68069d | |||
| 7bde5d634f | |||
| e2d6d5ce57 | |||
| 576e2823a3 | |||
| fc142bd775 | |||
| cc7803c0a6 | |||
| ede051f1b8 | |||
| 32f799db0d | |||
| 25c022d7c5 | |||
| f370bebdc3 | |||
| b0d1d7f71a | |||
| 19ae0505ae | |||
| 33f98cfded | |||
| f09a081d27 | |||
| f7354a3bd6 | |||
| c0b5ad9473 | |||
| f9f27b0fc2 | |||
| 244a53e0f6 | |||
| cb45f71c4d | |||
| 50d0cf4f6b | |||
| d33d313192 | |||
| ef978d0a7b | |||
| 45425660d0 | |||
| 700329493d | |||
| f71c9ccf59 | |||
| 093848d3cc | |||
| 224794b011 | |||
| c030fc8913 | |||
| 9b1976697d | |||
| 929134bf65 | |||
| 08a2edfc66 | |||
| ae4fb84629 | |||
| bc4bbd9f6e | |||
| cbd278f0f6 | |||
| 73dc23f786 | |||
| ad08137e47 | |||
| bdbcd5d482 | |||
| 734dd96e02 | |||
| 816c2237c1 | |||
| 574a538455 | |||
| caa0ff0bf1 | |||
| 5a73316bed | |||
| 732d2a8aac | |||
| eec5a3a8d8 | |||
| d933818d67 | |||
| de55ead1f1 | |||
| ef7e93699a | |||
| 34678db4a1 | |||
| 280c757f6c | |||
| bece55d8f9 | |||
| 6d644d6852 | |||
| e893b1efbb | |||
| ef42cb6274 | |||
| b002353dca | |||
| 46092f763d | |||
| 51042ae8e5 | |||
| db611aabee | |||
| 41c42f85f6 | |||
| 4b423e6074 | |||
| 0b8604d002 | |||
| 85e9d64480 | |||
| b3961f7291 | |||
| b8f1cde931 | |||
| fd6a0ade9b | |||
| 14b04b4b9c | |||
| 5c6b83cb69 | |||
| 12cc123359 | |||
| 3ef7134553 | |||
| 805d5d2111 | |||
| 570b3f9cdd | |||
| b91cff5a3e | |||
| a5f5568d75 | |||
| 5d997f227c | |||
| 5c081e2993 | |||
| 69a26c7ecd | |||
| 0e52af4d7b | |||
| 0dd58d96a0 | |||
| 21dc585942 | |||
| d6e5b02ef3 | |||
| 7cc6f822a3 | |||
| 8e05ad326b | |||
| 762af3e3c7 | |||
| bdb391e9c6 | |||
| c9785d956b | |||
| 6df9179c1c | |||
| 5bfda28dd3 | |||
| 288bf5c1d2 | |||
| d085662c59 | |||
| 21da3b2461 | |||
| 7790943c91 | |||
| 3e93dd295b | |||
| 883ed4b344 | |||
| a243cdca2a | |||
| 33df09e71a | |||
| b4199c2dad | |||
| eb734e5147 | |||
| 0ebee8b933 | |||
| 57632bf98c | |||
| db5e0c3292 | |||
| 72256bc72a | |||
| ab0ddc99e8 | |||
| 40ea9ab2a1 | |||
| 3bc65505fc | |||
| e1cec43415 | |||
| 9b7668c03a | |||
| 797a1babf2 | |||
| aaccf1844e | |||
| e58cbed51d | |||
| b219ae6bd4 | |||
| 1d6a84749b | |||
| 6ecb2ab679 | |||
| 69873d529d | |||
| cc44ca8017 | |||
| da69de17e8 | |||
| 5334796d20 | |||
| 9f40639292 | |||
| dcc49d8a7e | |||
| 1e3c9ddacc | |||
| fc63914399 | |||
| 3eceaa3637 | |||
| 975003eacb | |||
| e8fdd7875d | |||
| a9862a0f49 | |||
| 592f2eabd1 | |||
| a5e6df82c0 | |||
| 87b4ade9e5 | |||
| 3257946fb7 | |||
| d2f06dfffc | |||
| 3763101f85 | |||
| c7f01beece | |||
| 740fc6a1da | |||
| 8835bff6a0 | |||
| 86a4e5a96b | |||
| 2629c8f36a | |||
| 897a826d83 | |||
| 360ea8fc72 | |||
| 9ad815e412 | |||
| 27597fea07 | |||
| e840aa67e8 | |||
| 87499420bf | |||
| ea52ed9dc8 | |||
| 64845307b3 | |||
| 65aabafe2f | |||
| af38c837ee | |||
| 8878eb1bd9 | |||
| 75a33d60f2 | |||
| 18fbeec824 | |||
| 9d20601259 | |||
| 9e78c9acfb | |||
| 0a3b9d02fe | |||
| e6d250e4cd | |||
| 19f0b7dd02 | |||
| 54e17a15dc | |||
| 2ab76c2c4f | |||
| 253f9a3f97 | |||
| b4e66d7a67 | |||
| 43bfd093e1 | |||
| 2d8ee9817c | |||
| f9ab07f920 | |||
| c037b2e340 | |||
| ca7912d191 | |||
| 8b03615b7b | |||
| 9deb18ca1a | |||
| 0a49f909bc | |||
| 6015f91a5a | |||
| 8b46c5bcfc | |||
| 03af4c42a6 | |||
| 122b2657f8 | |||
| 4fdf47cd3c | |||
| fc296f419e | |||
| 2f3ea08a07 | |||
| 5c66378cea | |||
| 2c7b26f508 | |||
| 57f44dc428 | |||
| bd6205919a | |||
| c26b2a29e5 | |||
| 2aef9a9601 | |||
| ae9a344cce | |||
| 1a2e966cfe | |||
| 245da7ed38 | |||
| 3632fb3c25 | |||
| 768aa3d9cd | |||
| b5ca8fcd20 | |||
| df6a855e7b | |||
| cf345d5f38 | |||
| 6de6fdd06d | |||
| e092b4ad68 | |||
| 9ed538f2e6 | |||
| 1470f731b6 | |||
| c20d90d577 | |||
| bab3331906 | |||
| 4b4c6aabfb | |||
| e4dad4fe32 | |||
| 1b8decb04c | |||
| 63864e057f | |||
| 6824461f2a | |||
| 24178c2461 | |||
| 7d6627d0d9 | |||
| 6d02ca4bb9 | |||
| 7d77d7f79c | |||
| ca0379b8c8 | |||
| 67239f7360 | |||
| 0b192de1f3 | |||
| 68e85fc822 | |||
| 391177441b | |||
| 9b23d0de0e | |||
| 14170b784b | |||
| 7bb1c0c147 | |||
| 211f93aab9 | |||
| 4e931a8eb3 | |||
| 5e11d72d4d | |||
| 216dff7549 | |||
| 38e96324ef | |||
| 52e2c13da3 | |||
| 098c3f400c | |||
| ba47efbfe4 | |||
| 375b4e0935 | |||
| a7e0ed829c | |||
| ab37b801b1 | |||
| a0922a538b | |||
| ef81759e31 | |||
| 6ae71ec836 | |||
| 78dd120282 | |||
| 72958fcd3c | |||
| 3ca18d6d09 | |||
| 946bac798c | |||
| 153755ee38 | |||
| a0be960dcc | |||
| 777f2243f5 | |||
| abd2531034 | |||
| 408b2b3c50 | |||
| 6ba63ac3a0 | |||
| 0ac3875011 | |||
| 6ce6a5adb9 | |||
| a8531f3bfd | |||
| a09130feee | |||
| ace74d16bd | |||
| 5e09af2acd | |||
| 033ec57c03 | |||
| d9e4bc2895 | |||
| 546e7679e7 | |||
| 0ee4590684 | |||
| 6accd5effb | |||
| 5936c8c57c | |||
| 910faa3e1f | |||
| 576cd45a57 | |||
| 914771cbfe | |||
| 368a58e61c |

@@ -209,6 +209,7 @@ jobs:
- run: make deps_table_check_updated
- run: python utils/update_metadata.py --check-only
- run: python utils/check_task_guides.py
- run: python utils/check_docstrings.py

workflows:
version: 2

@@ -127,6 +127,8 @@ class CircleCIJob:
},
]
steps.extend([{"run": l} for l in self.install_steps])
steps.extend([{"run": 'pip install "fsspec>=2023.5.0,<2023.10.0"'}])
steps.extend([{"run": "pip install pytest-subtests"}])
steps.append(
{
"save_cache": {
@@ -311,7 +313,7 @@ torch_job = CircleCIJob(
"pip install -U --upgrade-strategy eager git+https://github.com/huggingface/accelerate",
],
parallelism=1,
pytest_num_workers=8,
pytest_num_workers=6,
)


@@ -347,6 +349,7 @@ pipelines_torch_job = CircleCIJob(
"pip install -U --upgrade-strategy eager .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video]",
],
marker="is_pipeline_test",
pytest_num_workers=6,
)


@@ -466,13 +469,15 @@ exotic_models_job = CircleCIJob(
"sudo apt install tesseract-ocr",
"pip install -U --upgrade-strategy eager pytesseract",
"pip install -U --upgrade-strategy eager natten",
# TODO (ydshieh): Remove this line once `https://github.com/facebookresearch/detectron2/issues/5010` is resolved
'pip install -U --upgrade-strategy eager "Pillow<10.0.0"',
"pip install -U --upgrade-strategy eager python-Levenshtein",
"pip install -U --upgrade-strategy eager opencv-python",
"pip install -U --upgrade-strategy eager nltk",
],
tests_to_run=[
"tests/models/*layoutlmv*",
"tests/models/*nat",
"tests/models/deta",
"tests/models/nougat",
],
pytest_num_workers=1,
pytest_options={"durations": 100},
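
For context on the hunks above: the config script builds each CircleCI job's steps as a list of dicts, and the change appends the fsspec pin and the pytest-subtests install the same way the existing install steps are added. A simplified, self-contained sketch of that pattern (the variable names and the placeholder first step are illustrative, not the actual class internals):

```python
# Simplified sketch of how run steps are accumulated as a list of dicts,
# mirroring the pattern in the hunk above (illustrative names, not the real CircleCIJob class).
install_steps = ["pip install -U --upgrade-strategy eager .[testing]"]  # illustrative install command

steps = [{"checkout": None}]                                            # placeholder first step
steps.extend([{"run": step} for step in install_steps])                 # one "run" entry per install command
steps.extend([{"run": 'pip install "fsspec>=2023.5.0,<2023.10.0"'}])    # fsspec pin added by the diff
steps.extend([{"run": "pip install pytest-subtests"}])                  # pytest-subtests install added by the diff
print(steps)
```
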

.github/conda/meta.yaml (4 changed lines, vendored)

@@ -26,6 +26,8 @@ requirements:
- protobuf
- tokenizers >=0.11.1,!=0.11.3,<0.13
- pyyaml >=5.1
- safetensors
- fsspec
run:
- python
- numpy >=1.17
@@ -40,6 +42,8 @@ requirements:
- protobuf
- tokenizers >=0.11.1,!=0.11.3,<0.13
- pyyaml >=5.1
- safetensors
- fsspec

test:
imports:

.github/workflows/build_documentation.yml (2 changed lines, vendored)

@@ -15,7 +15,7 @@ jobs:
commit_sha: ${{ github.sha }}
package: transformers
notebook_folder: transformers_doc
languages: de en es fr it ko pt zh
languages: de en es fr hi it ko pt zh ja te
secrets:
token: ${{ secrets.HUGGINGFACE_PUSH }}
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}

.github/workflows/build_pr_documentation.yml (2 changed lines, vendored)

@@ -14,4 +14,4 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: transformers
languages: de en es fr it ko pt zh
languages: de en es fr hi it ko pt zh ja te

.github/workflows/doctests.yml (2 changed lines, vendored)

@@ -20,7 +20,7 @@ env:

jobs:
run_doctests:
runs-on: [self-hosted, doc-tests-gpu]
runs-on: [single-gpu, nvidia-gpu, t4, doctest-ci]
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

.github/workflows/self-nightly-scheduled.yml (42 changed lines, vendored)

@@ -21,40 +21,12 @@ env:
RUN_PT_TF_CROSS_TESTS: 1

jobs:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: NVIDIA-SMI
run: |
nvidia-smi

setup:
name: Setup
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -94,7 +66,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [single-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -155,7 +127,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -215,7 +187,7 @@ jobs:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
@@ -276,8 +248,6 @@ jobs:
runs-on: ubuntu-latest
if: always()
needs: [
check_runner_status,
check_runners,
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
@@ -288,8 +258,6 @@ jobs:
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"
echo "Setup status: ${{ needs.setup.result }}"

- uses: actions/checkout@v3
@@ -303,8 +271,6 @@ jobs:
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: Nightly CI
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.

.github/workflows/self-past.yml (42 changed lines, vendored)

@@ -32,40 +32,12 @@ env:
RUN_PT_TF_CROSS_TESTS: 1

jobs:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: NVIDIA-SMI
run: |
nvidia-smi

setup:
name: Setup
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -101,7 +73,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [single-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -177,7 +149,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -253,7 +225,7 @@ jobs:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
needs: setup
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
@@ -319,8 +291,6 @@ jobs:
runs-on: ubuntu-latest
if: always()
needs: [
check_runner_status,
check_runners,
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
@@ -331,8 +301,6 @@ jobs:
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"
echo "Setup status: ${{ needs.setup.result }}"

- uses: actions/checkout@v3
@@ -351,8 +319,6 @@ jobs:
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.

.github/workflows/self-push-amd-mi210-caller.yml (new file, 25 lines, vendored)

@@ -0,0 +1,25 @@
name: Self-hosted runner (AMD mi210 CI caller)

on:
workflow_run:
workflows: ["Self-hosted runner (push-caller)"]
branches: ["main"]
types: [completed]
push:
branches:
- run_amd_push_ci_caller*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"

jobs:
run_amd_ci:
name: AMD mi210
if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi210
secrets: inherit

.github/workflows/self-push-amd-mi250-caller.yml (new file, 25 lines, vendored)

@@ -0,0 +1,25 @@
name: Self-hosted runner (AMD mi250 CI caller)

on:
workflow_run:
workflows: ["Self-hosted runner (push-caller)"]
branches: ["main"]
types: [completed]
push:
branches:
- run_amd_push_ci_caller*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"

jobs:
run_amd_ci:
name: AMD mi250
if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi250
secrets: inherit

.github/workflows/self-push-amd.yml (60 changed lines, vendored)

@@ -1,21 +1,11 @@
name: Self-hosted runner AMD GPU (push)

on:
workflow_run:
workflows: ["Self-hosted runner (push-caller)"]
branches: ["main"]
types: [completed]
push:
branches:
- ci_*
- ci-*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"
repository_dispatch:
workflow_call:
inputs:
gpu_flavor:
required: true
type: string

env:
HF_HOME: /mnt/cache
@@ -44,28 +34,30 @@ jobs:
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: ROCM-SMI
run: |
rocm-smi
rocminfo | grep "Agent" -A 14
- name: Show HIP environment
run: |
echo "HIP: $HIP_VISIBLE_DEVICES"
echo "ROCR: $ROCR_VISIBLE_DEVICES"

setup_gpu:
name: Setup
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
test_map: ${{ steps.set-matrix.outputs.test_map }}
@@ -150,7 +142,7 @@ jobs:
echo "matrix=$keys" >> $GITHUB_OUTPUT
echo "test_map=$test_map" >> $GITHUB_OUTPUT

run_tests_single_gpu:
run_tests_amdgpu:
name: Model tests
needs: setup_gpu
# `dummy` means there is no test to run
@@ -159,12 +151,11 @@ jobs:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
@@ -216,7 +207,11 @@ jobs:

- name: ROCM-SMI
run: |
rocm-smi
rocminfo | grep "Agent" -A 14
- name: Show HIP environment
run: |
echo "HIP: $HIP_VISIBLE_DEVICES"
echo "ROCR: $ROCR_VISIBLE_DEVICES"

- name: Environment
working-directory: /transformers
@@ -252,8 +247,7 @@ jobs:
check_runner_status,
check_runners,
setup_gpu,
run_tests_single_gpu,
# run_tests_multi_gpu,
run_tests_amdgpu,
# run_tests_torch_cuda_extensions_single_gpu,
# run_tests_torch_cuda_extensions_multi_gpu
]
@@ -314,7 +308,7 @@ jobs:
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: push
CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
CI_SHA: ${{ env.CI_SHA }}

.github/workflows/self-push.yml (44 changed lines, vendored)

@@ -27,40 +27,12 @@ env:
RUN_PT_TF_CROSS_TESTS: 1

jobs:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
container:
image: huggingface/transformers-all-latest-gpu-push-ci
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: NVIDIA-SMI
run: |
nvidia-smi

setup:
name: Setup
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci]
container:
image: huggingface/transformers-all-latest-gpu-push-ci
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -158,7 +130,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci]
container:
image: huggingface/transformers-all-latest-gpu-push-ci
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -251,7 +223,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [multi-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci]
container:
image: huggingface/transformers-all-latest-gpu-push-ci
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -344,7 +316,7 @@ jobs:
fail-fast: false
matrix:
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci]
container:
image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -434,7 +406,7 @@ jobs:
fail-fast: false
matrix:
machine_type: [multi-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci]
container:
image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -521,8 +493,6 @@ jobs:
runs-on: ubuntu-latest
if: always()
needs: [
check_runner_status,
check_runners,
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
@@ -534,9 +504,7 @@ jobs:
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Setup status: ${{ needs.setup.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"

# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
@@ -589,8 +557,6 @@ jobs:
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
CI_SHA: ${{ env.CI_SHA }}
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}

# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change

.github/workflows/self-scheduled.yml (50 changed lines, vendored)

@@ -25,40 +25,12 @@ env:
RUN_PT_TF_CROSS_TESTS: 1

jobs:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: NVIDIA-SMI
run: |
nvidia-smi

setup:
name: Setup
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -98,7 +70,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [single-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -159,7 +131,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -219,7 +191,7 @@ jobs:
fail-fast: false
matrix:
machine_type: [single-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -270,7 +242,7 @@ jobs:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
container:
image: huggingface/transformers-pytorch-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -320,7 +292,7 @@ jobs:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
container:
image: huggingface/transformers-tensorflow-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -371,7 +343,7 @@ jobs:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
@@ -430,8 +402,6 @@ jobs:
runs-on: ubuntu-latest
if: always()
needs: [
check_runner_status,
check_runners,
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
@@ -480,8 +450,6 @@ jobs:
runs-on: ubuntu-latest
if: always()
needs: [
check_runner_status,
check_runners,
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
@@ -496,8 +464,6 @@ jobs:
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"
echo "Setup status: ${{ needs.setup.result }}"

- uses: actions/checkout@v3
@@ -513,8 +479,6 @@ jobs:
CI_EVENT: scheduled
CI_SHA: ${{ github.sha }}
CI_WORKFLOW_REF: ${{ github.workflow_ref }}
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.

CONTRIBUTING.md

@@ -40,8 +40,8 @@ There are several ways you can contribute to 🤗 Transformers:

If you don't know where to start, there is a special [Good First
Issue](https://github.com/huggingface/transformers/contribute) listing. It will give you a list of
open issues that are beginner-friendly and help you start contributing to open-source. Just comment in the issue that you'd like to work
on it.
open issues that are beginner-friendly and help you start contributing to open-source. Just comment on the issue that you'd like to work
on.

For something slightly more challenging, you can also take a look at the [Good Second Issue](https://github.com/huggingface/transformers/labels/Good%20Second%20Issue) list. In general though, if you feel like you know what you're doing, go for it and we'll help you get there! 🚀

@@ -62,7 +62,7 @@ feedback.
The 🤗 Transformers library is robust and reliable thanks to users who report the problems they encounter.

Before you report an issue, we would really appreciate it if you could **make sure the bug was not
already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask on the [forum](https://discuss.huggingface.co/) first. This helps us respond quicker to fixing issues related to the library versus general questions.
already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask in the [forum](https://discuss.huggingface.co/) first. This helps us respond quicker to fixing issues related to the library versus general questions.

Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:

@@ -105,7 +105,7 @@ We have added [templates](https://github.com/huggingface/transformers/tree/main/

New models are constantly released and if you want to implement a new model, please provide the following information

* A short description of the model and link to the paper.
* A short description of the model and a link to the paper.
* Link to the implementation if it is open-sourced.
* Link to the model weights if they are available.

@@ -172,7 +172,7 @@ You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/mai

which should be enough for most use cases.

5. Develop the features on your branch.
5. Develop the features in your branch.

As you work on your code, you should make sure the test suite
passes. Run the tests impacted by your changes like this:
@@ -208,7 +208,7 @@ You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/mai
make quality
```

Finally, we have a lot of scripts to make sure we didn't forget to update
Finally, we have a lot of scripts to make sure we don't forget to update
some files when adding a new model. You can run these scripts with:

```bash
@@ -218,7 +218,7 @@ You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/mai
To learn more about those checks and how to fix any issues with them, check out the
[Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.

If you're modifying documents under `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check
If you're modifying documents under the `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check
make sure you install the documentation builder:

```bash
@@ -234,7 +234,7 @@ You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/mai
This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated
Markdown files with your favorite editor. You can also preview the docs on GitHub when you open a pull request.

Once you're happy with your changes, add changed files with `git add` and
Once you're happy with your changes, add the changed files with `git add` and
record your changes locally with `git commit`:

```bash
@@ -261,7 +261,7 @@ You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/mai

If you've already opened a pull request, you'll need to force push with the `--force` flag. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally.

6. Now you can go to your fork of the repository on GitHub and click on **Pull request** to open a pull request. Make sure you tick off all the boxes in our [checklist](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md/#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review.
6. Now you can go to your fork of the repository on GitHub and click on **Pull Request** to open a pull request. Make sure you tick off all the boxes on our [checklist](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md/#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review.

7. It's ok if maintainers request changes, it happens to our core contributors
too! So everyone can see the changes in the pull request, work in your local

Makefile (2 changed lines)

@@ -43,6 +43,7 @@ repo-consistency:
python utils/check_doctest_list.py
python utils/update_metadata.py --check-only
python utils/check_task_guides.py
python utils/check_docstrings.py

# this target runs checks on all files

@@ -82,6 +83,7 @@ fix-copies:
python utils/check_dummies.py --fix_and_overwrite
python utils/check_doctest_list.py --fix_and_overwrite
python utils/check_task_guides.py --fix_and_overwrite
python utils/check_docstrings.py --fix_and_overwrite

# Run tests for the library

README.md (49 changed lines)

@@ -51,8 +51,11 @@ limitations under the License.
<a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">한국어</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
<p>
<a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_ru.md">Русский</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_pt-br.md">Português</a> |
<a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
</p>
</h4>

<h3 align="center">
@@ -67,7 +70,7 @@ limitations under the License.

These models can be applied on:

* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, text generation, in over 100 languages.
* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, and text generation, in over 100 languages.
* 🖼️ Images, for tasks like image classification, object detection, and segmentation.
* 🗣️ Audio, for tasks like speech recognition and audio classification.

@@ -145,7 +148,7 @@ To immediately use a model on a given input (text, image, audio, ...), we provid
[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
```

The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here the answer is "positive" with a confidence of 99.97%.
The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here, the answer is "positive" with a confidence of 99.97%.
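
For reference, a minimal sketch of the pipeline call this output and explanation refer to (the input sentence here is illustrative):

```python
# Minimal sketch of the sentiment-analysis pipeline usage described above.
from transformers import pipeline

classifier = pipeline("sentiment-analysis")  # downloads and caches a default pretrained model
print(classifier("We are very happy to introduce pipeline to the transformers repository."))
# e.g. [{'label': 'POSITIVE', 'score': 0.9996980428695679}]
```
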
Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in computer vision and speech. For example, we can easily extract detected objects in an image:

@@ -179,7 +182,7 @@ Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in compute
'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
```

Here we get a list of objects detected in the image, with a box surrounding the object and a confidence score. Here is the original image on the left, with the predictions displayed on the right:
Here, we get a list of objects detected in the image, with a box surrounding the object and a confidence score. Here is the original image on the left, with the predictions displayed on the right:
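
For reference, a minimal sketch of an object-detection pipeline call that produces predictions of this shape, using the COCO sample image linked just below (requests and PIL are assumed only for fetching the image):

```python
# Minimal sketch of the object-detection pipeline usage described above.
import requests
from PIL import Image
from transformers import pipeline

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
image = Image.open(requests.get(url, stream=True).raw)

detector = pipeline("object-detection")
for prediction in detector(image):
    # each prediction carries a label, a confidence score, and a bounding box
    print(prediction["label"], round(prediction["score"], 3), prediction["box"])
```
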
<h3 align="center">
<a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
@@ -210,7 +213,7 @@ And here is the equivalent code for TensorFlow:
>>> outputs = model(**inputs)
```

The tokenizer is responsible for all the preprocessing the pretrained model expects, and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or simply directly pass to your model using the ** argument unpacking operator.
The tokenizer is responsible for all the preprocessing the pretrained model expects and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or simply directly pass to your model using the ** argument unpacking operator.

The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset.
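
For reference, a minimal PyTorch sketch of the tokenizer-to-model hand-off described above (the bert-base-uncased checkpoint and input string are illustrative):

```python
# Minimal sketch: the tokenizer returns a dict that can be unpacked straight into the model with **.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello world!", return_tensors="pt")  # preprocessing handled entirely by the tokenizer
outputs = model(**inputs)                                # ** unpacking passes the dict as keyword arguments
```
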
@ -230,7 +233,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta
|
||||
1. Choose the right framework for every part of a model's lifetime:
|
||||
- Train state-of-the-art models in 3 lines of code.
|
||||
- Move a single model between TF2.0/PyTorch/JAX frameworks at will.
|
||||
- Seamlessly pick the right framework for training, evaluation and production.
|
||||
- Seamlessly pick the right framework for training, evaluation, and production.
|
||||
|
||||
1. Easily customize a model or an example to your needs:
|
||||
- We provide examples for each architecture to reproduce the results published by its original authors.
|
||||
@ -241,19 +244,19 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta
|
||||
|
||||
- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
|
||||
- The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly, [Accelerate](https://huggingface.co/docs/accelerate)).
|
||||
- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.
|
||||
- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.
|
||||
|
||||
## Installation
|
||||
|
||||
### With pip
|
||||
|
||||
This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ and TensorFlow 2.6+.
|
||||
This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+, and TensorFlow 2.6+.
|
||||
|
||||
You should install ЁЯдЧ Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
|
||||
|
||||
First, create a virtual environment with the version of Python you're going to use and activate it.
|
||||
|
||||
Then, you will need to install at least one of Flax, PyTorch or TensorFlow.
|
||||
Then, you will need to install at least one of Flax, PyTorch, or TensorFlow.
|
||||
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages regarding the specific installation command for your platform.
|
||||
|
||||
When one of those backends has been installed, ЁЯдЧ Transformers can be installed using pip as follows:
|
||||
@ -280,7 +283,7 @@ Follow the installation pages of Flax, PyTorch or TensorFlow to see how to insta
|
||||
|
||||
## Model architectures
|
||||
|
||||
**[All the model checkpoints](https://huggingface.co/models)** provided by ЁЯдЧ Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models) where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
|
||||
**[All the model checkpoints](https://huggingface.co/models)** provided by ЁЯдЧ Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
|
||||
|
||||
Current number of checkpoints: 
|
||||
|
||||
@ -292,11 +295,11 @@ Current number of checkpoints: ** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
|
||||
1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
|
||||
1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
|
||||
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
|
||||
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
|
||||
1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from ├Йcole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
|
||||
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
|
||||
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
|
||||
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova.
|
||||
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
|
||||
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
@ -361,13 +364,14 @@ Current number of checkpoints: ** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sa─Яnak Ta┼Я─▒rlar. Released with the paper [blog post](https://www.adept.ai/blog/fuyu-8b)
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
||||
1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
|
||||
1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
|
||||
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
|
||||
1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
|
||||
1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
|
||||
1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
|
||||
1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
|
||||
1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
|
||||
1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
|
||||
1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
|
||||
1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
|
||||
1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
|
||||
1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
|
||||
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
|
||||
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
|
||||
1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
|
||||
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
|
||||
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
|
||||
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
|
||||
1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
|
||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
|
||||
1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
|
||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
|
||||
1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
|
||||
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
|
||||
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
|
||||
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
|
||||
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
|
||||
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
|
||||
1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
|
||||
|
||||
To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the ЁЯдЧ Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
|
||||
|
||||
|
||||
README_es.md
@ -18,7 +18,7 @@ limitations under the License.
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
|
||||
<br>
|
||||
<p>
|
||||
</p>
|
||||
<p align="center">
|
||||
<a href="https://circleci.com/gh/huggingface/transformers">
|
||||
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
|
||||
@ -46,8 +46,9 @@ limitations under the License.
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">эХЬъ╡ньЦ┤</a> |
|
||||
<b>Español</b> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">цЧецЬмшкЮ</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">рд╣рд┐рдиреНрджреА</a>
|
||||
<p>
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">рд╣рд┐рдиреНрджреА</a> |
|
||||
<a href="https://github.com/huggingface/transformers//blob/main/README_te.md">р░др▒Жр░▓р▒Бр░Чр▒Б</a> |
|
||||
</p>
|
||||
</h4>
|
||||
|
||||
<h3 align="center">
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
|
||||
1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
|
||||
1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
|
||||
1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
|
||||
1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
|
||||
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
|
||||
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
|
||||
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
|
||||
1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
|
||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
|
||||
1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
|
||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
|
||||
1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released in [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
|
||||
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
|
||||
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
|
||||
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
|
||||
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
|
||||
|
||||
README_hd.md
@ -43,7 +43,7 @@ checkpoint: рдЬрд╛рдБрдЪ рдмрд┐рдВрджреБ
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
|
||||
<br>
|
||||
<p>
|
||||
</p>
|
||||
<p align="center">
|
||||
<a href="https://circleci.com/gh/huggingface/transformers">
|
||||
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
|
||||
@ -72,7 +72,8 @@ checkpoint: рдЬрд╛рдБрдЪ рдмрд┐рдВрджреБ
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Espa├▒ol</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">цЧецЬмшкЮ</a> |
|
||||
<b>हिन्दी</b> |
|
||||
<p>
|
||||
<a href="https://github.com/huggingface/transformers//blob/main/README_te.md">р░др▒Жр░▓р▒Бр░Чр▒Б</a> |
|
||||
</p>
|
||||
</h4>
|
||||
|
||||
<h3 align="center">
|
||||
@ -85,13 +86,13 @@ checkpoint: рдЬрд╛рдБрдЪ рдмрд┐рдВрджреБ
|
||||
|
||||
🤗 Transformers provides thousands of pretrained models to perform text classification, information extraction, question answering, summarization, translation and text generation in more than 100 languages. Its aim is to make state-of-the-art NLP accessible to everyone.
|
||||
|
||||
🤗 Transformers provides an API to quickly download and use a pretrained model on a given text, fine-tune it on your own dataset, and then share it with the community via the [model hub](https://huggingface.co/models). At the same time, each defined Python module is fully standalone, which makes it easy to modify and convenient for quick research experiments.
|
||||
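As a quick illustration of that API, here is a minimal sketch (assuming the `transformers` package and a backend such as PyTorch are installed; the task name and example sentence are only illustrative) that downloads a default pretrained model and runs it on a piece of text:

```python
from transformers import pipeline

# Downloads a default pretrained model and tokenizer for the task on first use.
classifier = pipeline("sentiment-analysis")

print(classifier("We are very happy to show you the 🤗 Transformers library."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]
```

The same `pipeline` object can also be pointed at any compatible checkpoint from the model hub instead of the default one.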
|
||||
🤗 Transformers supports the three most popular deep learning libraries, [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), and integrates seamlessly with them. You can train your model with one framework, then load it and run inference with another.
|
||||
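A hedged sketch of that interoperability is shown below; it assumes both `torch` and `tensorflow` are installed, and the checkpoint name and local path are purely illustrative:

```python
from transformers import AutoModel, TFAutoModel

# Save a checkpoint with the PyTorch class...
pt_model = AutoModel.from_pretrained("bert-base-uncased")
pt_model.save_pretrained("./local-checkpoint")

# ...and reload the same weights as a TensorFlow model.
tf_model = TFAutoModel.from_pretrained("./local-checkpoint", from_pt=True)
```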
|
||||
## Online demos
|
||||
|
||||
You can test most of our models directly on their pages on the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, and an inference API](https://huggingface.co/pricing).
|
||||
|
||||
Here are a few examples:
|
||||
- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
|
||||
@ -165,7 +166,7 @@ checkpoint: рдЬрд╛рдБрдЪ рдмрд┐рдВрджреБ
|
||||
|
||||
The tokenizer is responsible for all the preprocessing the pretrained models expect and can be called directly on a single string (as in the examples above) or a list. It outputs a dictionary that you can use in downstream code or simply pass straight to your model using the `**` argument-unpacking operator, as in the sketch below.
|
||||
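A minimal sketch of that pattern (the checkpoint name is only an example and PyTorch is assumed as the backend):

```python
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# The tokenizer returns a dict of tensors (input_ids, attention_mask, ...).
inputs = tokenizer("Hello, Transformers!", return_tensors="pt")

# The dict can be unpacked straight into the model with `**`.
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```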
|
||||
The model itself is a regular [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend), which you can use as usual. [This tutorial](https://huggingface.co/transformers/training.html) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune it on a new dataset.
|
||||
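For the `Trainer` route, a deliberately simplified sketch is shown below; it assumes the optional `datasets` package is installed, and the dataset slice, column handling and hyperparameters are illustrative only:

```python
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# A tiny slice of IMDB, tokenized to fixed-length inputs.
train_ds = load_dataset("imdb", split="train[:1%]")
train_ds = train_ds.map(
    lambda batch: tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128),
    batched=True,
)

args = TrainingArguments(output_dir="./out", per_device_train_batch_size=8, num_train_epochs=1)
trainer = Trainer(model=model, args=args, train_dataset=train_ds)
trainer.train()
```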
|
||||
## Why should I use Transformers?
|
||||
|
||||
@ -206,7 +207,9 @@ checkpoint: рдЬрд╛рдБрдЪ рдмрд┐рдВрджреБ
|
||||
|
||||
First, create a virtual environment with the version of Python you are going to use, and activate it.
|
||||
|
||||
Then, you will need to install at least one of Flax, PyTorch or TensorFlow. Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/), the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) or the [Flax installation page](https://github.com/google/flax#quick-install) for the specific installation command for your platform.
|
||||
|
||||
Once one of these backends has been installed successfully, 🤗 Transformers can be installed as follows:
|
||||
|
||||
@ -214,7 +217,7 @@ checkpoint: рдЬрд╛рдБрдЪ рдмрд┐рдВрджреБ
|
||||
pip install transformers
|
||||
```
|
||||
|
||||
If you would like to try out the examples or need the latest in-development code before the official release, you have to [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source).
|
||||
|
||||
### With conda
|
||||
|
||||
@ -229,7 +232,7 @@ conda install -c huggingface transformers
|
||||
To install Flax, PyTorch or TensorFlow via conda, please refer to their respective installation pages for instructions.
|
||||
|
||||
## Model architectures
|
||||
[рдЙрдкрдпреЛрдЧрдХрд░реНрддрд╛](https://huggingface.co/users) рдФрд░ [organization](https://huggingface.co) рджреНрд╡рд╛рд░рд╛ рдЯреНрд░рд╛рдВрд╕рдлреЙрд░реНрдорд░ рд╕рдорд░реНрдерд┐рдд [**рд╕рднреА рдореЙрдбрд▓ рдЪреМрдХрд┐рдпреЛрдВ**](https://huggingface.co/models/users) рд╣рдЧрд┐рдВрдЧрдлреЗрд╕.рдХреЛ/рдСрд░реНрдЧрдирд╛рдЗрдЬреЗрд╢рди), рд╕рднреА рдХреЛ рдмрд┐рдирд╛ рдХрд┐рд╕реА рдмрд╛рдзрд╛ рдХреЗ рд╣рдЧрд┐рдВрдЧрдлреЗрд╕.рдХреЛ [рдореЙрдбрд▓ рд╣рдм](https://huggingface.co) рдХреЗ рд╕рд╛рде рдПрдХреАрдХреГрдд рдХрд┐рдпрд╛ рдЧрдпрд╛ рд╣реИред
Current number of checkpoints:
@ -310,6 +313,7 @@ conda install -c huggingface transformers
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research рд╕реЗ) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (рд╕реАрдПрдордпреВ/рдЧреВрдЧрд▓ рдмреНрд░реЗрди рд╕реЗ) рд╕рд╛рде рдореЗрдВ рдХрд╛рдЧрдЬ [рдлрд╝рдирд▓-рдЯреНрд░рд╛рдВрд╕рдлреЙрд░реНрдорд░: рдХреБрд╢рд▓ рднрд╛рд╖рд╛ рдкреНрд░рд╕рдВрд╕реНрдХрд░рдг рдХреЗ рд▓рд┐рдП рдЕрдиреБрдХреНрд░рдорд┐рдХ рдЕрддрд┐рд░реЗрдХ рдХреЛ рдЫрд╛рдирдирд╛](https://arxiv.org/abs/2006.03236) рдЬрд┐рд╣рд╛рдВрдЧ рджрд╛рдИ, рдЧреБрдУрдХреБрди рд▓рд╛рдИ, рдпрд┐рдорд┐рдВрдЧ рдпрд╛рдВрдЧ, рдХреНрд╡реЛрдХ рд╡реА. рд▓реЗ тАЛтАЛрджреНрд╡рд╛рд░рд╛ рд░рд┐рд╣рд╛рдИред
|
||||
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (ADEPT рд╕реЗ) рд░реЛрд╣рди рдмрд╛рд╡рд┐рд╢реА, рдПрд░рд┐рдЪ рдПрд▓рд╕реЗрди, рдХрд░реНрдЯрд┐рд╕ рд╣реЙрдереЛрд░реНрди, рдореИрдХреНрд╕рд╡реЗрд▓ рдиреА, рдСрдЧрд╕реНрдЯрд╕ рдУрдбреЗрдирд╛, рдЕрд░реБрд╢реА рд╕реЛрдорд╛рдиреА, рд╕рд╛рдЧрдирд╛рдХ рддрд╛рд╕рд┐рд░рд▓рд╛рд░ [blog post](https://www.adept.ai/blog/fuyu-8b)
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
@ -331,6 +335,7 @@ conda install -c huggingface transformers
|
||||
1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
|
||||
1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce рд╕реЗ) Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
|
||||
1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (рдорд╛рдЗрдХреНрд░реЛрд╕реЙрдлреНрдЯ рд░рд┐рд╕рд░реНрдЪ рдПрд╢рд┐рдпрд╛ рд╕реЗ) рд╕рд╛рде рджреЗрдиреЗ рд╡рд╛рд▓рд╛ рдкреЗрдкрд░ [рд▓реЗрдЖрдЙрдЯрдПрд▓рдПрдорд╡реА3: рдпреВрдирд┐рдлрд╛рдЗрдб рдЯреЗрдХреНрд╕реНрдЯ рдФрд░ рдЗрдореЗрдЬ рдорд╛рд╕реНрдХрд┐рдВрдЧ рдХреЗ рд╕рд╛рде рджрд╕реНрддрд╛рд╡реЗрдЬрд╝ рдПрдЖрдИ рдХреЗ рд▓рд┐рдП рдкреВрд░реНрд╡-рдкреНрд░рд╢рд┐рдХреНрд╖рдг](https://arxiv.org/abs/2204.08387) рдпреБрдкрди рд╣реБрдЖрдВрдЧ, рдЯреЗрдВрдЧрдЪрд╛рдУ рд▓рд╡, рд▓реЗрдИ рдХреБрдИ, рдпреБрдЯреЛрдВрдЧ рд▓реВ, рдлреБрд░реБ рд╡реЗрдИ рджреНрд╡рд╛рд░рд╛ рдкреЛрд╕реНрдЯ рдХрд┐рдпрд╛ рдЧрдпрд╛ред
|
||||
@ -357,6 +362,7 @@ conda install -c huggingface transformers
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA рд╕реЗ) рдХрд╛рдЧрдЬ рдХреЗ рд╕рд╛рде [Megatron-LM: рдореЙрдбрд▓ рдХрд╛ рдЙрдкрдпреЛрдЧ рдХрд░рдХреЗ рдмрд╣реБ-рдЕрд░рдм рдкреИрд░рд╛рдореАрдЯрд░ рднрд╛рд╖рд╛ рдореЙрдбрд▓ рдХрд╛ рдкреНрд░рд╢рд┐рдХреНрд╖рдг Parallelism](https://arxiv.org/abs/1909.08053) рдореЛрд╣рдореНрдордж рд╢реЛрдПрдмреА, рдореЛрд╕реНрдЯреЛрдлрд╛ рдкрдЯрд╡рд╛рд░реА, рд░рд╛рдЙрд▓ рдкреБрд░реА, рдкреИрдЯреНрд░рд┐рдХ рд▓реЗрдЧреНрд░реЗрд╕реНрд▓реЗ, рдЬреЗрд░реЗрдб рдХреИрд╕реНрдкрд░ рдФрд░ рдмреНрд░рд╛рдпрди рдХреИрдЯрд╛рдирдЬрд╝рд╛рд░реЛ рджреНрд╡рд╛рд░рд╛ред
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research рд╕реЗ) Peng Wang, Cheng Da, and Cong Yao. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (рдлреНрд░реЙрдо Studio Ousia) рд╕рд╛рде рдореЗрдВ рдкреЗрдкрд░ [mLUKE: рдж рдкрд╛рд╡рд░ рдСрдл рдПрдВрдЯрд┐рдЯреА рд░рд┐рдкреНрд░реЗрдЬреЗрдВрдЯреЗрд╢рди рдЗрди рдорд▓реНрдЯреАрд▓рд┐рдВрдЧреБрдЕрд▓ рдкреНрд░реАрдЯреНрд░реЗрдиреНрдб рд▓реИрдВрдЧреНрд╡реЗрдЬ рдореЙрдбрд▓реНрд╕](https://arxiv.org/abs/2110.08151) рд░рдпреЛрдХрди рд░реА, рдЗрдХреБрдпрд╛ рдпрд╛рдорд╛рдбрд╛, рдФрд░ рдпреЛрд╢рд┐рдорд╛рд╕рд╛ рддреНрд╕реБрд░реЛрдХрд╛ рджреНрд╡рд╛рд░рд╛ред
|
||||
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook рд╕реЗ) Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
|
||||
@ -374,15 +380,17 @@ conda install -c huggingface transformers
|
||||
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
|
||||
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta рд╕реЗ) the NLLB team. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI рд╕реЗ) Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[Nystr├╢mformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (рд╡рд┐рд╕реНрдХреЙрдиреНрд╕рд┐рди рд╡рд┐рд╢реНрд╡рд╡рд┐рджреНрдпрд╛рд▓рдп - рдореИрдбрд┐рд╕рди рд╕реЗ) рд╕рд╛рде рдореЗрдВ рдХрд╛рдЧрдЬ [Nystr├╢mformer: A Nystr├╢m- рдЖрдзрд╛рд░рд┐рдд рдПрд▓реНрдЧреЛрд░рд┐рдердо рдЖрддреНрдо-рдзреНрдпрд╛рди рдХрд╛ рдЕрдиреБрдорд╛рди рд▓рдЧрд╛рдиреЗ рдХреЗ рд▓рд┐рдП ](https://arxiv.org/abs/2102.03902) рдпреБрдирдпрд╛рдВрдЧ рдЬрд╝рд┐рдУрдВрдЧ, рдЭрд╛рдирдкреЗрдВрдЧ рдЬрд╝реЗрдВрдЧ, рд░реБрджреНрд░рд╕рд┐рд╕ рдЪрдХреНрд░рд╡рд░реНрддреА, рдорд┐рдВрдЧрдХреНрд╕рд┐рдВрдЧ рдЯреИрди, рдЧреНрд▓реЗрди рдлрдВрдЧ, рдпрд┐рди рд▓реА, рд╡рд┐рдХрд╛рд╕ рд╕рд┐рдВрд╣ рджреНрд╡рд╛рд░рд╛ рдкреЛрд╕реНрдЯ рдХрд┐рдпрд╛ рдЧрдпрд╛ред
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs рд╕реЗ) рдкреЗрдкрд░ [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) рдЬрд┐рддреЗрд╢ рдЬреИрди, рдЬрд┐рдЖрдЪреЗрди рд▓реА, рдорд╛рдВрдЧрдЯрд┐рдХ рдЪрд┐рдЙ, рдЕрд▓реА рд╣рд╕рдиреА, рдирд┐рдХрд┐рддрд╛ рдУрд░рд▓реЛрд╡, рд╣рдореНрдлреНрд░реА рд╢рд┐ рдХреЗ рджреНрд╡рд╛рд░рд╛ рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛ рд╣реИред
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
|
||||
1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (Google AI рд╕реЗ) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
|
||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
|
||||
1. **[Persimmon](https://huggingface.co/docs/transformers/main/model_doc/persimmon)** (ADEPT рд╕реЗ) Erich Elsen, Augustus Odena, Maxwell Nye, Sa─Яnak Ta┼Я─▒rlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [blog post](https://www.adept.ai/blog/persimmon-8b) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (ADEPT рд╕реЗ) Erich Elsen, Augustus Odena, Maxwell Nye, Sa─Яnak Ta┼Я─▒rlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [blog post](https://www.adept.ai/blog/persimmon-8b) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
|
||||
1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google рд╕реЗ) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
@ -402,6 +410,7 @@ conda install -c huggingface transformers
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology) released with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng рд╕реЗ) Bo Peng. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [this repo](https://github.com/BlinkDL/RWKV-LM) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T тАФ Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI рд╕реЗ) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
@ -440,7 +449,7 @@ conda install -c huggingface transformers
|
||||
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (Meta AI рд╕реЗ) Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[ViTMatte](https://huggingface.co/docs/transformers/main/model_doc/vitmatte)** (HUST-VL рд╕реЗ) Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL рд╕реЗ) Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
|
||||
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise рд╕реЗ) Jaehyeon Kim, Jungil Kong, Juhee Son. рджреНрд╡рд╛рд░рд╛рдЕрдиреБрд╕рдВрдзрд╛рди рдкрддреНрд░ [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) рдХреЗ рд╕рд╛рде рдЬрд╛рд░реА рдХрд┐рдпрд╛ рдЧрдпрд╛
|
||||
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lu─Нi─З, Cordelia Schmid.
|
||||
README_ja.md
@ -53,7 +53,7 @@ user: уГжуГ╝уВ╢
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
|
||||
<br>
|
||||
<p>
|
||||
</p>
|
||||
<p align="center">
|
||||
<a href="https://circleci.com/gh/huggingface/transformers">
|
||||
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
|
||||
@ -82,7 +82,8 @@ user: уГжуГ╝уВ╢
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Espa├▒ol</a> |
|
||||
<b>цЧецЬмшкЮ</b> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">рд╣рд┐рдиреНрджреА</a>
|
||||
<p>
|
||||
<a href="https://github.com/huggingface/transformers//blob/main/README_te.md">р░др▒Жр░▓р▒Бр░Чр▒Б</a> |
|
||||
</p>
|
||||
</h4>
|
||||
|
||||
<h3 align="center">
|
||||
@ -210,7 +211,7 @@ Hugging FaceуГБуГ╝уГауБлуВИуБгуБжф╜ЬуВЙуВМуБЯ **[уГИуГйуГ│уВ╣уГХуВйуГ╝уГЮуГ╝уВТ
```python
>>> outputs = model(**inputs)
```
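Only the last line of the PyTorch snippet is visible in this hunk. For context, a self-contained sketch of what that block does looks roughly like the following; the checkpoint name and input sentence are illustrative assumptions, not taken from the diff:

```python
# Self-contained sketch of the PyTorch quick-tour block whose last line appears above.
# "bert-base-uncased" and the input sentence are illustrative assumptions.
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # e.g. torch.Size([1, 5, 768])
```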
And here is the equivalent code for TensorFlow:
```python
>>> from transformers import AutoTokenizer, TFAutoModel
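# (The hunk ends here; as an assumption, the remaining lines of this TensorFlow block
# mirror the PyTorch example shown above.)
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = TFAutoModel.from_pretrained("bert-base-uncased")

>>> inputs = tokenizer("Hello world!", return_tensors="tf")
>>> outputs = model(**inputs)
```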
@ -372,6 +373,7 @@ FlaxуАБPyTorchуАБTensorFlowуВТcondaуБзуВдуГ│уВ╣уГИуГ╝уГлуБЩуВЛцЦ╣ц│ХуБпуАБуБЭуВМ
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research уБЛуВЙ) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824)
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research уБЛуВЙ) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [Focal Modulation Networks](https://arxiv.org/abs/2203.11926)
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain уБЛуВЙ) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236)
|
||||
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (ADEPT уБЛуВЙ) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sa─Яnak Ta┼Я─▒rlar. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [blog post](https://www.adept.ai/blog/fuyu-8b)
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (Microsoft Research уБЛуВЙ) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100)
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST уБЛуВЙ) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436)
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI уБЛуВЙ) Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/)
|
||||
@ -393,6 +395,7 @@ FlaxуАБPyTorchуАБTensorFlowуВТcondaуБзуВдуГ│уВ╣уГИуГ╝уГлуБЩуВЛцЦ╣ц│ХуБпуАБуБЭуВМ
|
||||
1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
|
||||
1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce уБЛуВЙ) Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500)
|
||||
1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI уБЛуВЙ) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf)
|
||||
1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia уБЛуВЙ) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318)
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia уБЛуВЙ) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740)
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (Microsoft Research Asia уБЛуВЙ) Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387)
|
||||
@ -419,6 +422,7 @@ FlaxуАБPyTorchуАБTensorFlowуВТcondaуБзуВдуГ│уВ╣уГИуГ╝уГлуБЩуВЛцЦ╣ц│ХуБпуАБуБЭуВМ
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA уБЛуВЙ) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA уБЛуВЙ) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
|
||||
1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research уБЛуВЙ) Peng Wang, Cheng Da, and Cong Yao. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592)
|
||||
1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia уБЛуВЙ) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151)
|
||||
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook уБЛуВЙ) Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516)
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain уБЛуВЙ) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984)
|
||||
@ -436,15 +440,17 @@ FlaxуАБPyTorchуАБTensorFlowуВТcondaуБзуВдуГ│уВ╣уГИуГ╝уГлуБЩуВЛцЦ╣ц│ХуБпуАБуБЭуВМ
|
||||
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei NoahтАЩs Ark Lab уБЛуВЙ) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204)
|
||||
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta уБЛуВЙ) the NLLB team уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta уБЛуВЙ) the NLLB team. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)
|
||||
1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI уБЛуВЙ) Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418)
|
||||
1. **[Nystr├╢mformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison уБЛуВЙ) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Nystr├╢mformer: A Nystr├╢m-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902)
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs уБЛуВЙ) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220)
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI уБЛуВЙ) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068)
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI уБЛуВЙ) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230)
|
||||
1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (Google AI уБЛуВЙ) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google уБЛуВЙ) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)
|
||||
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google уБЛуВЙ) Jason Phang, Yao Zhao, and Peter J. Liu уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347)
|
||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind уБЛуВЙ) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier H├йnaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, Jo├гo Carreira уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795)
|
||||
1. **[Persimmon](https://huggingface.co/docs/transformers/main/model_doc/persimmon)** (ADEPT уБЛуВЙ) Erich Elsen, Augustus Odena, Maxwell Nye, Sa─Яnak Ta┼Я─▒rlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [blog post](https://www.adept.ai/blog/persimmon-8b)
|
||||
1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (ADEPT уБЛуВЙ) Erich Elsen, Augustus Odena, Maxwell Nye, Sa─Яnak Ta┼Я─▒rlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [blog post](https://www.adept.ai/blog/persimmon-8b)
|
||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (VinAI Research уБЛуВЙ) Dat Quoc Nguyen and Anh Tuan Nguyen уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/)
|
||||
1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google уБЛуВЙ) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347)
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP уБЛуВЙ) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333)
|
||||
@ -464,6 +470,7 @@ FlaxуАБPyTorchуАБTensorFlowуВТcondaуБзуВдуГ│уВ╣уГИуГ╝уГлуБЩуВЛцЦ╣ц│ХуБпуАБуБЭуВМ
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI уБЛуВЙ) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf)
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology уБЛуВЙ), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864)
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng уБЛуВЙ) Bo Peng. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [this repo](https://github.com/BlinkDL/RWKV-LM)
|
||||
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T тАФ Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA уБЛуВЙ) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203)
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI уБЛуВЙ) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP уБЛуВЙ) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
|
||||
@ -502,7 +509,7 @@ FlaxуАБPyTorchуАБTensorFlowуВТcondaуБзуВдуГ│уВ╣уГИуГ╝уГлуБЩуВЛцЦ╣ц│ХуБпуАБуБЭуВМ
|
||||
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI уБЛуВЙ) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)
|
||||
1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (Meta AI уБЛуВЙ) Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527)
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (Meta AI уБЛуВЙ) Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Doll├бr, Ross Girshick уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377)
|
||||
1. **[ViTMatte](https://huggingface.co/docs/transformers/main/model_doc/vitmatte)** (HUST-VL уБЛуВЙ) Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272)
|
||||
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (HUST-VL уБЛуВЙ) Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272)
|
||||
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (Meta AI уБЛуВЙ) Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ: [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141)
|
||||
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise уБЛуВЙ) Jaehyeon Kim, Jungil Kong, Juhee Son. уБЛуВЙхЕмщЦЛуБХуВМуБЯчаФчй╢шлЦцЦЗ [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103)
|
||||
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lu─Нi─З, Cordelia Schmid.
|
||||
README_ko.md
@ -18,7 +18,7 @@ limitations under the License.
|
||||
<br>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
|
||||
<br>
|
||||
<p>
|
||||
</p>
|
||||
<p align="center">
|
||||
<a href="https://circleci.com/gh/huggingface/transformers">
|
||||
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
|
||||
@ -47,7 +47,8 @@ limitations under the License.
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Espa├▒ol</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">цЧецЬмшкЮ</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">рд╣рд┐рдиреНрджреА</a>
|
||||
<p>
|
||||
<a href="https://github.com/huggingface/transformers//blob/main/README_te.md">р░др▒Жр░▓р▒Бр░Чр▒Б</a> |
|
||||
</p>
|
||||
</h4>
|
||||
|
||||
<h3 align="center">
|
||||
@ -287,6 +288,7 @@ Flax, PyTorch, TensorFlow ьДдь╣Ш эОШьЭ┤ьзАьЧРьДЬ ьЭ┤ыУдьЭД condaыбЬ ьДдь╣ШэХШыКФ
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sa─Яnak Ta┼Я─▒rlar. ыЕ╝ым╕ъ│╝ эХиъ╗Ш ъ│╡ъ░Ь [blog post](https://www.adept.ai/blog/fuyu-8b)
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
@ -308,6 +310,7 @@ Flax, PyTorch, TensorFlow ьДдь╣Ш эОШьЭ┤ьзАьЧРьДЬ ьЭ┤ыУдьЭД condaыбЬ ьДдь╣ШэХШыКФ
|
||||
1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
|
||||
1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce ьЧРьДЬ ьаЬъ│╡)ьЭА Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.ьЭШ [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500)ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI ьЧРьДЬ) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever ьЭШ [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia ьЧРьДЬ) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou ьЭШ [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia ьЧРьДЬ) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou ьЭШ [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (Microsoft Research Asia ьЧРьДЬ) Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei ьЭШ [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
@ -334,6 +337,7 @@ Flax, PyTorch, TensorFlow ьДдь╣Ш эОШьЭ┤ьзАьЧРьДЬ ьЭ┤ыУдьЭД condaыбЬ ьДдь╣ШэХШыКФ
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (NVIDIA ьЧРьДЬ) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro ьЭШ [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA ьЧРьДЬ) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro ьЭШ [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research ьЧРьДЬ ьаЬъ│╡)ьЭА Peng Wang, Cheng Da, and Cong Yao.ьЭШ [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592)ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia ьЧРьДЬ) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka ьЭШ [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook ьЧРьДЬ ьаЬъ│╡)ьЭА Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.ьЭШ [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516)ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain ьЧРьДЬ) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou ьЭШ [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
@ -351,15 +355,17 @@ Flax, PyTorch, TensorFlow ьДдь╣Ш эОШьЭ┤ьзАьЧРьДЬ ьЭ┤ыУдьЭД condaыбЬ ьДдь╣ШэХШыКФ
|
||||
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (Huawei NoahтАЩs Ark Lab ьЧРьДЬ) Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu ьЭШ [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (Meta ьЧРьДЬ) the NLLB team ьЭШ [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (Meta ьЧРьДЬ ьаЬъ│╡)ьЭА the NLLB team.ьЭШ [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672)ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI ьЧРьДЬ ьаЬъ│╡)ьЭА Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.ьЭШ [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418)ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[Nystr├╢mformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison ьЧРьДЬ) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh ьЭШ [Nystr├╢mformer: A Nystr├╢m-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs ьЧРьДЬ) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi ьЭШ [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) ыЕ╝ым╕ъ│╝ эХиъ╗Ш ы░ЬэСЬэЦИьК╡ыЛИыЛд.
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[Persimmon](https://huggingface.co/docs/transformers/main/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
@@ -379,6 +385,7 @@ Flax, PyTorch, TensorFlow installation pages for how to install these with conda
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology) released with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released in [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
@@ -417,7 +424,7 @@ Flax, PyTorch, TensorFlow installation pages for how to install these with conda
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[ViTMatte](https://huggingface.co/docs/transformers/main/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.

565
README_pt-br.md
Normal file
@@ -0,0 +1,565 @@
<!---
Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
</p>

<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers">
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
</a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE">
<img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
</a>
<a href="https://huggingface.co/docs/transformers/index">
<img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online">
</a>
<a href="https://github.com/huggingface/transformers/releases">
<img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
</a>
<a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md">
<img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
</a>
<a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
</p>

<h4 align="center">
<p>
<b>English</b> |
<a href="https://github.com/huggingface/transformers/blob/main/README_zh-hans.md">简体中文</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_zh-hant.md">繁體中文</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">한국어</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_ru.md">Русский</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_pt-br.md">Português</a> |
<a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
</p>
</h4>

<h3 align="center">
<p>State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow</p>
</h3>

<h3 align="center">
<a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
</h3>

The 🤗 Transformers library provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.

These models can be applied to:

* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, and text generation, in over 100 languages.
* 🖼️ Images, for tasks like image classification, object detection, and segmentation.
* 🗣️ Audio, for tasks like speech recognition and audio classification.

Transformer models can also perform tasks on several modalities combined, such as table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets, and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each Python module defining an architecture is fully standalone and can be modified to enable quick research experiments.

🤗 Transformers is backed by the three most popular deep learning libraries, [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), with seamless integration between them. It is straightforward to train your models with one before loading them for inference with the other, as sketched below.

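A minimal sketch of that interoperability, assuming both PyTorch and TensorFlow are installed (the checkpoint name and local path are only illustrative): the same weights can be saved from one framework and reloaded in the other.

```python
from transformers import AutoModel, TFAutoModel

# Load a checkpoint in PyTorch and save it locally
pt_model = AutoModel.from_pretrained("bert-base-uncased")
pt_model.save_pretrained("./my-bert")

# Reload the same weights in TensorFlow; from_pt=True converts the PyTorch weights
tf_model = TFAutoModel.from_pretrained("./my-bert", from_pt=True)
```
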
## Online Demos

You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, and an inference API](https://huggingface.co/pricing)
for public and private models.

Here are a few examples:

In Natural Language Processing:

- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
- [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
- [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C)
- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
- [Sumariza├з├гo com BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
- [Resposta a perguntas com DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
- [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)


In Computer Vision:
- [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224)
- [Object detection with DETR](https://huggingface.co/facebook/detr-resnet-50)
- [Semantic segmentation with SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
- [Panoptic segmentation with MaskFormer](https://huggingface.co/facebook/maskformer-swin-small-coco)
- [Depth estimation with DPT](https://huggingface.co/docs/transformers/model_doc/dpt)
- [Video classification with VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)
- [Universal segmentation with OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)


In Audio:
- [Automatic speech recognition with Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h)
- [Keyword spotting with Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
- [Audio classification with Audio Spectrogram Transformer](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)

In Multimodal tasks:
- [Table question answering with TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
- [Visual question answering with ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
- [Zero-shot image classification with CLIP](https://huggingface.co/openai/clip-vit-large-patch14)
- [Document question answering with LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
- [Zero-shot video classification with X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)

## 100 Projects Using Transformers

Transformers is more than a toolkit for using pretrained models: it is a community of projects built around it and the Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone else to build their dream projects.

In order to celebrate Transformers reaching 100,000 stars, we decided to put the spotlight on the community and created the [awesome-transformers](./awesome-transformers.md) page, which lists 100 incredible projects built around Transformers.

If you own or use a project that you believe should be part of the list, please open a PR to add it!

## If you are looking for custom support from the Hugging Face team

<a target="_blank" href="https://huggingface.co/support">
<img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
</a><br>

## Quick Tour

To immediately use a model on a given input (text, image, audio, ...), we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify texts as positive or negative:

```python
>>> from transformers import pipeline

# Load the text classification pipeline
>>> classifier = pipeline("sentiment-analysis")

# Classify the text as positive or negative
>>> classifier("We are very happy to introduce pipeline to the transformers repository.")
[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
```

The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. In this example, the answer is "positive" with a confidence of 99.97%.

Many tasks have a pretrained `pipeline` ready to go, not only in NLP but also in computer vision and audio processing. For example, we can easily extract detected objects in an image:

```python
>>> import requests
>>> from PIL import Image
>>> from transformers import pipeline

# Download an image with cute cats
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
>>> image_data = requests.get(url, stream=True).raw
>>> image = Image.open(image_data)

# Allocate a pipeline for object detection
>>> object_detector = pipeline('object-detection')
>>> object_detector(image)
[{'score': 0.9982201457023621,
  'label': 'remote',
  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
 {'score': 0.9960021376609802,
  'label': 'remote',
  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
 {'score': 0.9954745173454285,
  'label': 'couch',
  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
 {'score': 0.9988006353378296,
  'label': 'cat',
  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
 {'score': 0.9986783862113953,
  'label': 'cat',
  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
```

Here we get a list of objects detected in the image, with a box surrounding each object and a confidence score. Here is the original image on the left, with the predictions displayed on the right:

<h3 align="center">
<a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
<a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
</h3>

You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary).

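Other tasks follow the same pattern: pick a task name and the pipeline takes care of the pre- and post-processing. As a minimal sketch (the default model for the task is downloaded on first use), zero-shot text classification looks like this:

```python
>>> from transformers import pipeline

>>> # Score arbitrary candidate labels against a text without any task-specific fine-tuning
>>> classifier = pipeline("zero-shot-classification")
>>> classifier("Transformers provides thousands of pretrained models.", candidate_labels=["machine learning", "cooking", "sports"])
```

The call returns the candidate labels ranked by score.
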
In addition to `pipeline`, to download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version:

```python
>>> from transformers import AutoTokenizer, AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = AutoModel.from_pretrained("bert-base-uncased")

>>> inputs = tokenizer("Hello world!", return_tensors="pt")
>>> outputs = model(**inputs)
```

And here is the equivalent code for TensorFlow:

```python
>>> from transformers import AutoTokenizer, TFAutoModel

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = TFAutoModel.from_pretrained("bert-base-uncased")

>>> inputs = tokenizer("Hello world!", return_tensors="tf")
>>> outputs = model(**inputs)
```

The tokenizer is responsible for all the preprocessing the pretrained model expects and can be called directly on a single string (as in the examples above) or on a list. It outputs a dictionary that you can use in downstream code or simply pass directly to your model using the ** argument unpacking operator.

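For example, a minimal sketch (reusing the `bert-base-uncased` checkpoint from the snippets above) of calling the tokenizer on a batch of sentences and unpacking the resulting dictionary into the model:

```python
>>> from transformers import AutoTokenizer, AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = AutoModel.from_pretrained("bert-base-uncased")

>>> # A list of sentences becomes a dict with 'input_ids' and 'attention_mask' tensors
>>> batch = tokenizer(["Hello world!", "Transformers is great."], padding=True, truncation=True, return_tensors="pt")
>>> outputs = model(**batch)  # ** unpacks the dictionary into keyword arguments
```
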
The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset.

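A rough sketch of the `Trainer` route; the dataset, the `datasets` library used to load it, and the hyperparameters below are illustrative assumptions rather than a recommended recipe:

```python
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenize a small text classification dataset (IMDB is just an example)
dataset = load_dataset("imdb")
dataset = dataset.map(lambda x: tokenizer(x["text"], truncation=True), batched=True)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="./results", num_train_epochs=1, per_device_train_batch_size=8),
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,  # enables dynamic padding of the batches
)
trainer.train()
```
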
## Why should I use transformers?

1. Easy-to-use state-of-the-art models:
    - High performance on natural language understanding & generation, computer vision, and audio tasks.
    - Low barrier to entry for educators and practitioners.
    - Few user-facing abstractions with just three classes to learn.
    - A unified API for using all our pretrained models.

1. Lower compute costs, smaller carbon footprint:
    - Researchers can share trained models instead of always retraining from scratch.
    - Practitioners can reduce compute time and production costs.
    - Dozens of architectures with over 60,000 pretrained models across all modalities.

1. Choose the right framework for every part of a model's lifetime:
    - Train state-of-the-art models in 3 lines of code.
    - Move a single model between TF2.0/PyTorch/JAX frameworks at will.
    - Seamlessly pick the right framework for training, evaluation, and production.

1. Easily customize a model or an example to fit your needs:
    - We provide examples for each architecture to reproduce the results published by its original authors.
    - Model internals are exposed as consistently as possible.
    - Model files can be used independently of the library for quick experiments.

## Why shouldn't I use transformers?

- This library is not a modular toolbox of building blocks for neural networks. The code in the model files is deliberately not refactored with additional abstractions, so that researchers can quickly iterate on each of the models without diving into extra abstractions/files.
- The training API is not designed to work with just any model; it is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly [Accelerate](https://huggingface.co/docs/accelerate)).
- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out of the box on your specific problem and that you will need to change a few lines of code to adapt them to your needs.

## Installation

### With pip

This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+, and TensorFlow 2.6+.

You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

First, create a virtual environment with the version of Python you're going to use and activate it.

Then, you will need to install at least one of the Flax, PyTorch, or TensorFlow backends.
Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/), the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally), and/or the [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages for the installation command specific to your platform.

When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:

```bash
pip install transformers
```

If you'd like to play with the examples or need the latest version of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source).

### With conda

Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.

🤗 Transformers can be installed using conda as follows:

```bash
conda install -c huggingface transformers
```

Follow the installation pages of Flax, PyTorch, or TensorFlow to see how to install them with conda.

> **_NOTE:_** On Windows, you may be prompted to activate Developer Mode in order to benefit from caching. If this is not an option for you, please let us know in [this issue](https://github.com/huggingface/huggingface_hub/issues/1062).

## Model Architectures

**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).

Current number of checkpoints: 

🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):

1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||
1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
|
||||
1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
|
||||
1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
|
||||
1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
|
||||
1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
|
||||
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
|
||||
1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
|
||||
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
|
||||
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
|
||||
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
|
||||
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
|
||||
1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
|
||||
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
|
||||
1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
|
||||
1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
|
||||
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
|
||||
1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
|
||||
1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
|
||||
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
|
||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
||||
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
|
||||
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
||||
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
|
||||
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
|
||||
1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
|
||||
1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
|
||||
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
|
||||
1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
|
||||
1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
|
||||
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
|
||||
1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
|
||||
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
|
||||
1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
|
||||
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
|
||||
1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
|
||||
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
|
||||
1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
|
||||
1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
|
||||
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
|
||||
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
|
||||
1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
|
||||
1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
|
||||
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
|
||||
1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
|
||||
1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
|
||||
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
|
||||
1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
|
||||
1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
|
||||
1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
|
||||
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
|
||||
1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
|
||||
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
|
||||
1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
|
||||
1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
|
||||
1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
|
||||
1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
|
||||
1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
|
||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
||||
1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
|
||||
1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
|
||||
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
|
||||
1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
|
||||
1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
|
||||
1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
|
||||
1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
|
||||
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
|
||||
1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
|
||||
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
|
||||
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
|
||||
1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
|
||||
1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
|
||||
1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
|
||||
1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
|
||||
1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
|
||||
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
|
||||
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
|
||||
1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
|
||||
1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
|
||||
1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
|
||||
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
|
||||
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
|
||||
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
|
||||
1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
|
||||
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
|
||||
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
|
||||
1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
|
||||
1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
|
||||
1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
|
||||
1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
|
||||
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
|
||||
1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
|
||||
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
|
||||
1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
|
||||
1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
|
||||
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
|
||||
1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
|
||||
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
|
||||
1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released in the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
|
||||
1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
|
||||
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
|
||||
1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
|
||||
1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
|
||||
1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
|
||||
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
|
||||
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
|
||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
|
||||
1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
|
||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
|
||||
1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
|
||||
1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
|
||||
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
|
||||
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
|
||||
1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
|
||||
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
|
||||
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
|
||||
1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
|
||||
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
|
||||
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
|
||||
1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
|
||||
1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
|
||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
|
||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
|
||||
1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
|
||||
1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
|
||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
|
||||
1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
|
||||
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
|
||||
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
|
||||
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
|
||||
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
|
||||
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
|
||||
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
|
||||
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
|
||||
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
|
||||
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
|
||||
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
|
||||
1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
|
||||
1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
|
||||
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
|
||||
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
|
||||
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
|
||||
1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
|
||||
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
|
||||
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have added a **detailed guide and example templates** to walk you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contribution guidelines](./CONTRIBUTING.md) and reach out to the maintainers or open an issue to gather feedback before starting your PR.
To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
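As a minimal sketch of what this cross-framework support looks like in practice (not an official snippet from this README: `bert-base-uncased` is only an example checkpoint, and it assumes both `torch` and `tensorflow` are installed), the same pretrained weights can be loaded through the PyTorch and TensorFlow auto classes:

```python
# Minimal sketch: load one checkpoint in both PyTorch and TensorFlow.
# "bert-base-uncased" is just an example; any checkpoint that the
# supported-frameworks table lists for both backends works the same way.
from transformers import AutoTokenizer, AutoModel, TFAutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# PyTorch version of the model
pt_model = AutoModel.from_pretrained("bert-base-uncased")
pt_outputs = pt_model(**tokenizer("Hello world!", return_tensors="pt"))

# TensorFlow version of the same architecture and weights
tf_model = TFAutoModel.from_pretrained("bert-base-uncased")
tf_outputs = tf_model(**tokenizer("Hello world!", return_tensors="tf"))

print(pt_outputs.last_hidden_state.shape)  # e.g. torch.Size([1, 5, 768])
```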
These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations. You can find more details on performance in the Examples section of the [documentation](https://github.com/huggingface/transformers/tree/main/examples).
## Learn more
| Section | Description |
|-|-|
| [Documentation](https://huggingface.co/docs/transformers/) | Full API documentation and tutorials |
| [Task summary](https://huggingface.co/docs/transformers/task_summary) | Tasks supported by 🤗 Transformers |
| [Preprocessing tutorial](https://huggingface.co/docs/transformers/preprocessing) | Using the `Tokenizer` class to prepare data for the models |
| [Training and fine-tuning](https://huggingface.co/docs/transformers/training) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API |
| [Quick tour: fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/main/examples) | Example scripts for fine-tuning models on a wide range of tasks |
| [Model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing) | Upload and share your fine-tuned models with the community |
## Citation
We now have a [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) you can cite for the 🤗 Transformers library:
```bibtex
@inproceedings{wolf-etal-2020-transformers,
    title = "Transformers: State-of-the-Art Natural Language Processing",
    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
    month = oct,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
    pages = "38--45"
}
```

551
README_ru.md
Normal file
@@ -0,0 +1,551 @@
<!---
Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<p align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
    <source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
    <img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
  </picture>
  <br/>
  <br/>
</p>

<p align="center">
    <a href="https://circleci.com/gh/huggingface/transformers">
        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
    </a>
    <a href="https://github.com/huggingface/transformers/blob/main/LICENSE">
        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
    </a>
    <a href="https://huggingface.co/docs/transformers/index">
        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online">
    </a>
    <a href="https://github.com/huggingface/transformers/releases">
        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
    </a>
    <a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md">
        <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
    </a>
    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
</p>

<h4 align="center">
    <p>
        <a href="https://github.com/huggingface/transformers/blob/main/README.md">English</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hans.md">简体中文</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hant.md">繁體中文</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">한국어</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a> |
        <b>Русский</b> |
        <a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
    </p>
</h4>

<h3 align="center">
    <p>State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow</p>
</h3>

<h3 align="center">
    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
</h3>

🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.

These models can be applied to:

* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, and text generation, in over 100 languages.
* 🖼️ Images, for tasks like image classification, object detection, and segmentation.
* 🗣️ Audio, for tasks like speech recognition and audio classification.

Transformer models can also perform tasks on several modalities combined, such as table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.

🤗 Transformers provides APIs to quickly download and use those pretrained models, fine-tune them on your own datasets, and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each Python module defining an architecture is fully standalone and can be modified to enable quick research experiments.

🤗 Transformers is backed by the three most popular deep learning libraries, [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), with seamless integration between them. This makes it easy to train your models with one of them and then load them for inference with another.

## Online demos

You can test most of our models directly on their pages on the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) for public and private models.

Here are a few examples:
In Natural Language Processing (NLP):
- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
- [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
- [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
- [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
- [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)

In Computer Vision:
- [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224)
- [Object detection with DETR](https://huggingface.co/facebook/detr-resnet-50)
- [Semantic segmentation with SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
- [Panoptic segmentation with MaskFormer](https://huggingface.co/facebook/maskformer-swin-small-coco)
- [Depth estimation with DPT](https://huggingface.co/docs/transformers/model_doc/dpt)
- [Video classification with VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)
- [Universal segmentation with OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)

In Audio:
- [Automatic speech recognition with Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h)
- [Keyword spotting with Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
- [Audio classification with Audio Spectrogram Transformer](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)

In Multimodal tasks:
- [Table question answering with TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
- [Visual question answering with ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
- [Zero-shot image classification with CLIP](https://huggingface.co/openai/clip-vit-large-patch14)
- [Document question answering with LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
- [Zero-shot video classification with X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)

## 100 projects using Transformers

Transformers is more than a toolkit for using pretrained models: it is a community of projects built around it and the Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone else to build the projects of their dreams.

In order to celebrate Transformers reaching 100,000 stars, we decided to put the spotlight on the community and created the [awesome-transformers](./awesome-transformers.md) page, which lists 100 incredible projects built with transformers.

If you own or use a project that you believe should be part of the list, please open a PR to add it!

## If you are looking for custom support from the Hugging Face team

<a target="_blank" href="https://huggingface.co/support">
    <img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
</a><br>

## Quick tour

To immediately use a model on a given input (text, image, audio, ...), we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts:
```python
>>> from transformers import pipeline

# Allocate a pipeline for sentiment analysis
>>> classifier = pipeline('sentiment-analysis')
>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
```

The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here, the answer is "POSITIVE" with a confidence of 99.97%.

Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in computer vision and speech. For example, we can easily extract detected objects in an image:
``` python
>>> import requests
>>> from PIL import Image
>>> from transformers import pipeline

# Download an image with cute cats
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
>>> image_data = requests.get(url, stream=True).raw
>>> image = Image.open(image_data)

# Allocate a pipeline for object detection
>>> object_detector = pipeline('object-detection')
>>> object_detector(image)
[{'score': 0.9982201457023621,
  'label': 'remote',
  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
 {'score': 0.9960021376609802,
  'label': 'remote',
  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
 {'score': 0.9954745173454285,
  'label': 'couch',
  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
 {'score': 0.9988006353378296,
  'label': 'cat',
  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
 {'score': 0.9986783862113953,
  'label': 'cat',
  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
```

Here, we get a list of objects detected in the image, with a box surrounding each object and a confidence score. The original image is on the left, with the predictions displayed on the right:

<h3 align="center">
    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
</h3>

You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_sum).

In addition to `pipeline`, all it takes is three lines of code to download and use any of the pretrained models for your given task. Here is the PyTorch version:
```python
>>> from transformers import AutoTokenizer, AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = AutoModel.from_pretrained("bert-base-uncased")

>>> inputs = tokenizer("Hello world!", return_tensors="pt")
>>> outputs = model(**inputs)
```

And here is the equivalent code for TensorFlow:
```python
>>> from transformers import AutoTokenizer, TFAutoModel

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = TFAutoModel.from_pretrained("bert-base-uncased")

>>> inputs = tokenizer("Hello world!", return_tensors="tf")
>>> outputs = model(**inputs)
```

The tokenizer is responsible for all the preprocessing the pretrained model expects, and can be called directly on a single string (as in the examples above) or a list. It outputs a dictionary that you can use in downstream code or simply pass directly to your model using the ** argument unpacking operator.
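
As a minimal sketch, the same tokenizer can also be called on a list of sentences; padding is enabled here so the resulting batch can be unpacked straight into the model with `**`:

```python
>>> from transformers import AutoTokenizer, AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = AutoModel.from_pretrained("bert-base-uncased")

# Calling the tokenizer on a list returns a single dictionary of padded, truncated batches
>>> batch = tokenizer(["Hello world!", "Transformers is great."], padding=True, truncation=True, return_tensors="pt")
>>> sorted(batch.keys())
['attention_mask', 'input_ids', 'token_type_ids']

# The dictionary can be passed straight to the model with ** unpacking
>>> outputs = model(**batch)
```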

The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset.
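
And here is a minimal sketch of fine-tuning with the `Trainer` API, assuming the 🤗 Datasets library is installed; the `imdb` dataset, the tiny training slice and the hyperparameters are only illustrative:

```python
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenize a tiny slice of a public dataset in batches (illustrative only)
dataset = load_dataset("imdb", split="train[:1%]")
dataset = dataset.map(lambda batch: tokenizer(batch["text"], truncation=True, padding="max_length"), batched=True)

training_args = TrainingArguments(output_dir="my_finetuned_bert", per_device_train_batch_size=8, num_train_epochs=1)
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
trainer.train()
```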
## Why should I use transformers?

1. Easy-to-use state-of-the-art models:
    - High performance on natural language understanding & generation, computer vision, and audio tasks.
    - Low barrier to entry for educators and practitioners.
    - Few user-facing abstractions with just three classes to learn.
    - A unified API for using all our pretrained models.

1. Lower compute costs, smaller carbon footprint:
    - Researchers can share trained models instead of always retraining.
    - Practitioners can reduce compute time and production costs.
    - Dozens of architectures with over 60,000 pretrained models across all modalities.

1. Choose the right framework for every part of a model's lifetime:
    - Train state-of-the-art models in 3 lines of code.
    - Move a single model between TF2.0/PyTorch/JAX frameworks at will (see the short sketch after this list).
    - Seamlessly pick the right framework for training, evaluation, and production.

1. Easily customize a model or an example to your needs:
    - We provide examples for each architecture to reproduce the results published by its original authors.
    - Model internals are exposed as consistently as possible.
    - Model files can be used independently of the library for quick experiments.

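A minimal sketch of that framework interoperability, assuming a `bert-base-uncased` checkpoint and a local `./my-bert` directory chosen only for illustration:

```python
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification

# Load (or fine-tune) a model in PyTorch and save it to disk
pt_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
pt_model.save_pretrained("./my-bert")

# Reload the exact same weights as a TensorFlow model
tf_model = TFAutoModelForSequenceClassification.from_pretrained("./my-bert", from_pt=True)
```

The same works in the other direction with `from_tf=True` when loading a TensorFlow checkpoint into PyTorch.
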
## Why shouldn't I use transformers?

- This library is not a modular toolbox of building blocks for neural networks. The code in the model files is deliberately not refactored with additional abstractions, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
- The training API is not intended to work with just any model; it is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly [Accelerate](https://huggingface.co/docs/accelerate)); a short sketch follows this list.
- While we strive to present as many use cases as possible, the scripts in our [examples](https://github.com/huggingface/transformers/tree/main/examples) folder are just that: examples. They are not expected to work out-of-the-box for your specific problem, and you will need to change a few lines of code to adapt them to your needs.

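As a minimal sketch of such a generic loop with [Accelerate](https://huggingface.co/docs/accelerate), using a tiny synthetic batch purely for illustration:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# A tiny synthetic batch of already-tokenized inputs, purely for illustration
dataset = TensorDataset(
    torch.randint(0, 30522, (8, 16)),            # input_ids
    torch.ones(8, 16, dtype=torch.long),         # attention_mask
    torch.randint(0, 2, (8,)),                   # labels
)
dataloader = DataLoader(dataset, batch_size=4)

model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

model.train()
for input_ids, attention_mask, labels in dataloader:
    optimizer.zero_grad()
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    accelerator.backward(outputs.loss)
    optimizer.step()
```
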
## Installation

### With pip

This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ and TensorFlow 2.6+.

You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

First, create a virtual environment with the version of Python you're going to use and activate it.

Then, you will need to install at least one of Flax, PyTorch or TensorFlow as a backend.
Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/), the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or the [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages for the install commands specific to your platform.

When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:

```bash
pip install transformers
```

If you'd like to play with the examples or need the bleeding-edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source).

### With conda

Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.

Transformers can be installed using conda as follows:

```bash
conda install -c huggingface transformers
```

Follow the installation pages of Flax, PyTorch or TensorFlow to see how to install them with conda.

> **_NOTE:_** On Windows, you may be prompted to activate Developer Mode in order to benefit from caching. If this is not an option for you, please let us know in [this issue](https://github.com/huggingface/huggingface_hub/issues/1062).

## Model architectures

**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).

Current number of checkpoints: 
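
As a minimal sketch, any of those checkpoints can be loaded by its hub identifier; the sentiment-analysis checkpoint below is only an example:

```python
>>> from transformers import pipeline

# Any checkpoint on the hub can be referenced by its identifier; this one is only an example
>>> classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
>>> result = classifier("Hub checkpoints load with a single identifier.")
```
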
🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):

1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
||||
1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
|
||||
1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
|
||||
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
|
||||
1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
|
||||
1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
|
||||
1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
|
||||
1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
|
||||
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
|
||||
1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
|
||||
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
|
||||
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
|
||||
1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
|
||||
1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
|
||||
1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
|
||||
1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
|
||||
1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
|
||||
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
|
||||
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
|
||||
1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
|
||||
1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
|
||||
1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
|
||||
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
|
||||
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
|
||||
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
|
||||
1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
|
||||
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
|
||||
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
|
||||
1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
|
||||
1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
|
||||
1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
|
||||
1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
|
||||
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
|
||||
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
|
||||
1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
|
||||
1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
|
||||
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
|
||||
1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
|
||||
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
|
||||
1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released in the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
|
||||
1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
|
||||
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
|
||||
1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
|
||||
1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
|
||||
1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
|
||||
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
|
||||
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
|
||||
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
|
||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
|
||||
1. **[Persimmon](https://huggingface.co/docs/transformers/main/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
|
||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
|
||||
1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
|
||||
1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
|
||||
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
|
||||
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
|
||||
1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
|
||||
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
|
||||
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
|
||||
1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
|
||||
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
|
||||
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
|
||||
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
|
||||
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
|
||||
1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
|
||||
1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
|
||||
1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
|
||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
|
||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
|
||||
1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
|
||||
1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
|
||||
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
|
||||
1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
|
||||
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
|
||||
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
|
||||
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
|
||||
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[ViTMatte](https://huggingface.co/docs/transformers/main/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
|
||||
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
|
||||
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
|
||||
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
|
||||
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
|
||||
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
|
||||
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
|
||||
1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
|
||||
1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
|
||||
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
|
||||
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
|
||||
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
|
||||
1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
|
||||
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
|
||||
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
|
||||
1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
|
||||
|
||||
To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or a tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
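For a quick illustration of what that table captures, the snippet below is a minimal sketch (not part of the table) that loads one example checkpoint with each framework-specific `Auto` class; `bert-base-uncased` is only an illustrative assumption of a checkpoint that ships PyTorch, TensorFlow and Flax weights plus a fast tokenizer:

```python
from transformers import AutoModel, AutoTokenizer, FlaxAutoModel, TFAutoModel

checkpoint = "bert-base-uncased"  # illustrative checkpoint, not prescribed by the table

# Fast tokenizer backed by the 🤗 Tokenizers library (when one is available)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer.is_fast)

pt_model = AutoModel.from_pretrained(checkpoint)        # PyTorch implementation
tf_model = TFAutoModel.from_pretrained(checkpoint)      # TensorFlow implementation
flax_model = FlaxAutoModel.from_pretrained(checkpoint)  # Flax implementation
```

Each of the three model classes requires the corresponding framework (PyTorch, TensorFlow or Flax) to be installed.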
|
||||
|
||||
These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations. You can find more details on performance in the Examples section of the [documentation](https://github.com/huggingface/transformers/tree/main/examples).
|
||||
|
||||
|
||||
## Learn more
|
||||
|
||||
| Section | Description |
|
||||
|-|-|
|
||||
| [Documentation](https://huggingface.co/docs/transformers/) | Full API documentation and tutorials |
|
||||
| [Task summary](https://huggingface.co/docs/transformers/task_summary) | Tasks supported by 🤗 Transformers |
|
||||
| [Preprocessing tutorial](https://huggingface.co/docs/transformers/preprocessing) | Using the `Tokenizer` class to prepare data for the models |
|
||||
| [Training and fine-tuning](https://huggingface.co/docs/transformers/training) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and with the `Trainer` API (a brief sketch follows this table) |
|
||||
| [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/main/examples) | Example scripts for fine-tuning models on a wide range of tasks |
|
||||
| [Model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing) | Upload and share your fine-tuned models with the community |
|
||||
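The preprocessing and training guides linked above fit together in practice: a tokenizer turns raw text into model inputs, and the `Trainer` API runs the fine-tuning loop. The sketch below (referenced from the table) is only an illustration, not taken from the guides; the `glue`/`sst2` dataset and the `distilbert-base-uncased` checkpoint are assumptions made for the example:

```python
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

checkpoint = "distilbert-base-uncased"  # illustrative checkpoint
dataset = load_dataset("glue", "sst2")  # illustrative dataset with "sentence" and "label" columns
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize(batch):
    # Turn raw sentences into input IDs and attention masks
    return tokenizer(batch["sentence"], truncation=True)


tokenized = dataset.map(tokenize, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="sst2-finetuned", num_train_epochs=1),
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,  # lets the default data collator pad batches dynamically
)
trainer.train()
```

The same tokenized dataset can just as well feed a plain PyTorch or Keras training loop if you prefer not to use `Trainer`.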
|
||||
## Citation
|
||||
|
||||
We now have a [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) you can cite for the 🤗 Transformers library:
|
||||
```bibtex
|
||||
@inproceedings{wolf-etal-2020-transformers,
|
||||
title = "Transformers: State-of-the-Art Natural Language Processing",
|
||||
author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
|
||||
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
|
||||
month = oct,
|
||||
year = "2020",
|
||||
address = "Online",
|
||||
publisher = "Association for Computational Linguistics",
|
||||
url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
|
||||
pages = "38--45"
|
||||
}
|
||||
```
|
||||
README_te.md (new file, 557 lines)
@@ -0,0 +1,557 @@
|
||||
<!---
|
||||
Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<p align="center">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
|
||||
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
|
||||
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
|
||||
</picture>
|
||||
<br/>
|
||||
<br/>
|
||||
</p>
|
||||
|
||||
|
||||
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers">
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
</a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE">
<img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
</a>
<a href="https://huggingface.co/docs/transformers/index">
<img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online">
</a>
<a href="https://github.com/huggingface/transformers/releases">
<img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
</a>
<a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md">
<img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
</a>
<a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
</p>

<h4 align="center">
<p>
<a href="https://github.com/huggingface/transformers/">English</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_zh-hans.md">简体中文</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_zh-hant.md">繁體中文</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">한국어</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_ru.md">Русский</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_pt-br.md">Português</a> |
<b>తెలుగు</b> |
</p>
</h4>

<h3 align="center">
<p>State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow</p>
</h3>

<h3 align="center">
<a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
</h3>

🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.

These models can be applied to:

* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, and text generation, in over 100 languages.
* 🖼️ Images, for tasks like image classification, object detection, and segmentation.
* 🗣️ Audio, for tasks like speech recognition and audio classification.

Transformer models can also perform tasks on **several modalities combined**, such as table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets, and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each Python module defining an architecture is fully standalone and can be modified to enable quick research experiments.

🤗 Transformers is backed by the three most popular deep learning libraries, [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.

## Online demos

You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) for public and private models.

Here are a few examples:

In Natural Language Processing:
- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
- [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
- [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+Lost.+Nobody+lost+any+animal)
- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
- [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
- [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)

In Computer Vision:
- [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224)
- [Object detection with DETR](https://huggingface.co/facebook/detr-resnet-50)
- [Semantic segmentation with SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
- [Panoptic segmentation with MaskFormer](https://huggingface.co/facebook/maskformer-swin-small-coco)
- [Depth estimation with DPT](https://huggingface.co/docs/transformers/model_doc/dpt)
- [Video classification with VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)
- [Universal segmentation with OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)

In Audio:
- [Automatic speech recognition with Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h)
- [Keyword spotting with Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
- [Audio classification with Audio Spectrogram Transformer](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)

In Multimodal tasks:
- [Table question answering with TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
- [Visual question answering with ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
- [Zero-shot image classification with CLIP](https://huggingface.co/openai/clip-vit-large-patch14)
- [Document question answering with LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
- [Zero-shot video classification with X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)

## 100 projects using Transformers

Transformers is more than a toolkit to use pretrained models: it's a community of projects built around it and the Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone else to build their dream projects.

In order to celebrate the 100,000 stars of transformers, we have decided to put the spotlight on the community, and we have created the [awesome-transformers](./awesome-transformers.md) page which lists 100 incredible projects built in the vicinity of transformers.

If you own or use a project that you believe should be part of the list, please open a PR to add it!

## If you are looking for custom support from the Hugging Face team

<a target="_blank" href="https://huggingface.co/support">
<img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
</a><br>

## Quick tour

To immediately use a model on a given input (text, image, audio, ...), we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts:

```python
>>> from transformers import pipeline

# Allocate a pipeline for sentiment-analysis
>>> classifier = pipeline('sentiment-analysis')
>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
```

The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here the answer is "positive" with a confidence of 99.97%.

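The snippet above relies on the default checkpoint for the task. As a minimal sketch of two common variations (the checkpoint name below is an assumption about the current default; any text-classification model from the Hub can be substituted), you can pin a specific model and pass several inputs at once:

```python
>>> from transformers import pipeline

# Pin a specific checkpoint instead of relying on the task default.
>>> classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# Pipelines also accept a list of inputs and return one prediction per item,
# each a {'label': ..., 'score': ...} dictionary as in the example above.
>>> classifier([
...     'We are very happy to introduce pipeline to the transformers repository.',
...     'The download is taking far too long.',
... ])
```
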
Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in computer vision and speech. For example, we can easily extract detected objects in an image:

```python
>>> import requests
>>> from PIL import Image
>>> from transformers import pipeline

# Download an image with cute cats
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
>>> image_data = requests.get(url, stream=True).raw
>>> image = Image.open(image_data)

# Allocate a pipeline for object detection
>>> object_detector = pipeline('object-detection')
>>> object_detector(image)
[{'score': 0.9982201457023621,
  'label': 'remote',
  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
 {'score': 0.9960021376609802,
  'label': 'remote',
  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
 {'score': 0.9954745173454285,
  'label': 'couch',
  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
 {'score': 0.9988006353378296,
  'label': 'cat',
  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
 {'score': 0.9986783862113953,
  'label': 'cat',
  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
```

Here we get a list of objects detected in the image, with a box surrounding each object and a confidence score. The original image is on the left, with the predictions displayed on the right:

<h3 align="center">
<a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
<a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
</h3>

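Because the pipeline returns a plain Python list of dictionaries, no special API is needed to post-process it. Continuing the session from the example above, here is a small sketch (the 0.9 cut-off is an arbitrary choice for illustration):

```python
# Keep only confident detections and print a short summary for each one.
>>> detections = object_detector(image)
>>> for detection in detections:
...     if detection['score'] > 0.9:
...         box = detection['box']
...         print(f"{detection['label']}: {detection['score']:.2f} at "
...               f"({box['xmin']}, {box['ymin']}) to ({box['xmax']}, {box['ymax']})")
```
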
You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary).

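The same one-line pattern covers many other tasks. For instance, here is a hedged sketch of zero-shot classification (the input sentence and candidate labels are arbitrary, and the default checkpoint for the task is downloaded on first use):

```python
>>> from transformers import pipeline

# Zero-shot classification scores a text against labels the model never saw during training.
>>> classifier = pipeline('zero-shot-classification')
>>> classifier(
...     'The team shipped the new release two weeks ahead of schedule.',
...     candidate_labels=['business', 'sports', 'politics'],
... )
# The result is a dictionary with the input 'sequence', the candidate 'labels'
# ranked from most to least likely, and their matching 'scores'.
```
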
In addition to `pipeline`, to download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version:

```python
>>> from transformers import AutoTokenizer, AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = AutoModel.from_pretrained("bert-base-uncased")

>>> inputs = tokenizer("Hello world!", return_tensors="pt")
>>> outputs = model(**inputs)
```

And here is the equivalent code for TensorFlow:

```python
>>> from transformers import AutoTokenizer, TFAutoModel

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = TFAutoModel.from_pretrained("bert-base-uncased")

>>> inputs = tokenizer("Hello world!", return_tensors="tf")
>>> outputs = model(**inputs)
```

The tokenizer is responsible for all the preprocessing the pretrained model expects and can be called directly on a single string (as in the examples above) or a list. It outputs a dictionary that you can use in downstream code or simply pass directly to your model using the `**` argument unpacking operator.

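As a minimal sketch of that dictionary in action (the example sentences are arbitrary), the tokenizer can pad and truncate a whole batch, and the resulting dictionary can be unpacked straight into the model:

```python
>>> from transformers import AutoTokenizer, AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = AutoModel.from_pretrained("bert-base-uncased")

# Tokenizing a list pads/truncates the whole batch and returns a dictionary
# of tensors ('input_ids', 'attention_mask', ...).
>>> batch = tokenizer(
...     ["Hello world!", "A slightly longer second sentence."],
...     padding=True,
...     truncation=True,
...     return_tensors="pt",
... )

# The same dictionary is handed to the model with ** argument unpacking.
>>> outputs = model(**batch)
>>> outputs.last_hidden_state.shape  # (batch_size, sequence_length, hidden_size)
```
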
The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset.

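As a minimal sketch of the first option, a classic PyTorch loop can fine-tune such a model directly; the two labelled sentences and the hyperparameters below are placeholders for illustration, not a real dataset:

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Toy batch: two sentences with made-up labels, only to show the mechanics.
texts = ["I love this library.", "This bug is really annoying."]
labels = torch.tensor([1, 0])
batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

model.train()
for _ in range(3):  # a few gradient steps on the toy batch
    optimizer.zero_grad()
    outputs = model(**batch, labels=labels)  # the model returns its loss when labels are passed
    outputs.loss.backward()
    optimizer.step()
```
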
## Why should I use transformers?

1. Easy-to-use state-of-the-art models:
    - High performance on natural language understanding & generation, computer vision, and audio tasks.
    - Low barrier to entry for educators and practitioners.
    - Few user-facing abstractions with just three classes to learn.
    - A unified API for using all our pretrained models.

2. Lower compute costs, smaller carbon footprint:
    - Researchers can share trained models instead of always retraining.
    - Practitioners can reduce compute time and production costs.
    - Dozens of architectures with over 60,000 pretrained models across all modalities.

3. Choose the right framework for every part of a model's lifetime:
    - Train state-of-the-art models in 3 lines of code.
    - Move a single model between TF2.0/PyTorch/JAX frameworks at will.
    - Seamlessly pick the right framework for training, evaluation, and production.

4. Easily customize a model or an example to your needs:
    - We provide examples for each architecture to reproduce the results published by its original authors.
    - Model internals are exposed as consistently as possible.
    - Model files can be used independently of the library for quick experiments.

## Why shouldn't I use transformers?

- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
- The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly, [Accelerate](https://huggingface.co/docs/accelerate)); see the sketch after this list.
- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.

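For the second point, here is a deliberately tiny, self-contained sketch of what such a generic loop looks like with [Accelerate](https://huggingface.co/docs/accelerate) (the linear model and random data are placeholders, not anything from this library):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

# Placeholder model and data, just to show where Accelerate hooks into the loop.
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
dataset = TensorDataset(torch.randn(32, 10), torch.randint(0, 2, (32,)))
dataloader = DataLoader(dataset, batch_size=8)
loss_fn = torch.nn.CrossEntropyLoss()

accelerator = Accelerator()  # handles device placement and distributed setup
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

model.train()
for inputs, targets in dataloader:
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    accelerator.backward(loss)  # replaces the usual loss.backward()
    optimizer.step()
```
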
## Installation

### With pip

This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+, and TensorFlow 2.6+.

You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

First, create a virtual environment with the version of Python you're going to use and activate it.

Then, you will need to install at least one of Flax, PyTorch, or TensorFlow.
Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/), the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or the [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages for the specific installation command for your platform.

When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:

```bash
pip install transformers
```

If you'd like to play with the examples, or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source).

### With conda

Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.

🤗 Transformers can be installed using conda as follows:

```shell script
conda install -c huggingface transformers
```

Follow the installation pages of Flax, PyTorch or TensorFlow to see how to install them with conda.

> **_NOTE:_** On Windows, you may be prompted to activate Developer Mode in order to benefit from caching. If this is not an option for you, please let us know in [this issue](https://github.com/huggingface/huggingface_hub/issues/1062).

## Model architectures

**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).

Current number of checkpoints:

🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each of them):

1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre D├йfossez.
1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and to contact the maintainers or open an issue to collect feedback before starting your PR.
|
||||
|
||||
р░кр▒Нр░░р░др░┐ р░ор▒Лр░бр░▓р▒Н р░лр▒Нр░▓р░╛р░Хр▒Нр░╕р▒Н, р░кр▒Ир░Яр░╛р░░р▒Нр░Ър▒Н р░▓р▒Зр░жр░╛ р░Яр▒Жр░ир▒Нр░╕р░░р▒НтАМр░лр▒Нр░▓р▒Лр░▓р▒Л р░Ер░ор░▓р▒Б р░Ър▒Зр░пр░мр░бр░┐р░Вр░жр░╛ р░▓р▒Зр░жр░╛ ЁЯдЧ Tokenizers р░▓р▒Ир░мр▒Нр░░р░░р▒А р░жр▒Нр░╡р░╛р░░р░╛ р░Ер░ир▒Бр░мр░Вр░зр░┐р░Вр░Ър░мр░бр░┐р░и р░Яр▒Лр░Хр▒Жр░ир▒Ир░Ьр░░р▒НтАМр░ир░┐ р░Хр░▓р░┐р░Чр░┐ р░Йр░Вр░жр▒Л р░▓р▒Зр░жр▒Л р░др░ир░┐р░Цр▒А р░Ър▒Зр░пр░бр░╛р░ир░┐р░Хр░┐, [р░И р░кр░Яр▒Нр░Яр░┐р░Х](https://huggingface.co/docs/transformers/index#supported-frameworks).
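
For illustration only, here is a minimal sketch of what that multi-framework support looks like in practice; `bert-base-uncased` is just an example checkpoint, and the TensorFlow/Flax classes require those frameworks to be installed:

```python
# Minimal sketch: loading one checkpoint across frameworks (assumes the
# corresponding implementations exist for this model; "bert-base-uncased"
# is only an illustrative checkpoint name).
from transformers import AutoTokenizer, AutoModel, TFAutoModel, FlaxAutoModel

checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer.is_fast)  # True when a 🤗 Tokenizers (Rust) backend is available

pt_model = AutoModel.from_pretrained(checkpoint)        # PyTorch implementation
tf_model = TFAutoModel.from_pretrained(checkpoint)      # TensorFlow implementation
flax_model = FlaxAutoModel.from_pretrained(checkpoint)  # Flax implementation
```
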
These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations. You can find more details on performance in the Examples section of the [documentation](https://github.com/huggingface/transformers/tree/main/examples).

## Learn more

| Section | Description |
|-|-|
| [Documentation](https://huggingface.co/docs/transformers/) | Full API documentation and tutorials |
| [Task summary](https://huggingface.co/docs/transformers/task_summary) | Tasks supported by 🤗 Transformers |
| [Preprocessing tutorial](https://huggingface.co/docs/transformers/preprocessing) | Using the `Tokenizer` class to prepare data for the models |
| [Training and fine-tuning](https://huggingface.co/docs/transformers/training) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API |
| [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/main/examples) | Example scripts for fine-tuning models on a wide range of tasks |
| [Model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing) | Upload and share your fine-tuned models with the community |
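
As a minimal, illustrative sketch of the preprocessing and `Trainer` workflow referenced in the table above (the checkpoint name and the tiny in-memory 🤗 Datasets dataset are placeholders, not a recommended setup):

```python
# Sketch only: tokenize a toy dataset and fine-tune with the Trainer API.
# "bert-base-uncased" and the two-example dataset are illustrative placeholders.
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

raw = Dataset.from_dict({"text": ["great movie", "terrible movie"], "label": [1, 0]})
tokenized = raw.map(
    lambda batch: tokenizer(batch["text"], truncation=True, padding="max_length", max_length=32),
    batched=True,
)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="out", num_train_epochs=1, per_device_train_batch_size=2),
    train_dataset=tokenized,
)
trainer.train()
```
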
## Citation

We now have a [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) you can cite for the 🤗 Transformers library:
```bibtex
@inproceedings{wolf-etal-2020-transformers,
    title = "Transformers: State-of-the-Art Natural Language Processing",
    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
    month = oct,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
    pages = "38--45"
}
```

@@ -43,7 +43,7 @@ checkpoint: 检查点
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
<p>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers">
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
@@ -72,7 +72,8 @@ checkpoint: 检查点
<a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
<p>
<a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
</p>
</h4>

<h3 align="center">
@@ -311,6 +312,7 @@ conda install -c huggingface transformers
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
@@ -332,6 +334,7 @@ conda install -c huggingface transformers
1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
@@ -358,6 +361,7 @@ conda install -c huggingface transformers
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by the Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
@@ -375,15 +379,17 @@ conda install -c huggingface transformers
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[Persimmon](https://huggingface.co/docs/transformers/main/model_doc/persimmon)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
@@ -403,6 +409,7 @@ conda install -c huggingface transformers
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology) released with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
@@ -441,7 +448,7 @@ conda install -c huggingface transformers
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[ViTMatte](https://huggingface.co/docs/transformers/main/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.

@@ -55,7 +55,7 @@ user: 使用者
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
<p>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers">
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
@@ -84,7 +84,8 @@ user: 使用者
<a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
<a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
<p>
<a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
</p>
</h4>

<h3 align="center">
@@ -323,6 +324,7 @@ conda install -c huggingface transformers
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/fuyu-8b) by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
@@ -344,6 +346,7 @@ conda install -c huggingface transformers
1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
@@ -370,6 +373,7 @@ conda install -c huggingface transformers
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by the Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
@@ -387,15 +391,17 @@ conda install -c huggingface transformers
1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[Persimmon](https://huggingface.co/docs/transformers/main/model_doc/persimmon)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released with the [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
@@ -415,6 +421,7 @@ conda install -c huggingface transformers
1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
@@ -453,7 +460,7 @@ conda install -c huggingface transformers
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[ViTMatte](https://huggingface.co/docs/transformers/main/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.

SECURITY.md (new file)
@@ -0,0 +1,6 @@
# Security Policy

## Reporting a Vulnerability

🤗 We have our bug bounty program set up with HackerOne. Please feel free to submit vulnerability reports to our private program at https://hackerone.com/hugging_face.
Note that you'll need to be invited to our program, so send us a quick email at security@huggingface.co if you've found a vulnerability.

@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).

ARG PYTORCH='2.0.1'
ARG PYTORCH='2.1.0'
# (not always a valid torch version)
ARG INTEL_TORCH_EXT='1.11.0'
# Example: `cu102`, `cu113`, etc.
@@ -55,6 +55,9 @@ RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://hu
# Add einops for additional model testing
RUN python3 -m pip install --no-cache-dir einops

# Add autoawq for quantization testing
RUN python3 -m pip install --no-cache-dir autoawq

# For bettertransformer + gptq
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum

@@ -11,7 +11,6 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]

RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
RUN python3 -m pip install --no-cache-dir pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
RUN python3 -m pip install -U "itsdangerous<2.1.0"

# Test if the image could successfully build the doc. before publishing the image

@@ -4,7 +4,7 @@ LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

ARG PYTORCH='2.0.1'
ARG PYTORCH='2.1.0'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu118'

@@ -36,7 +36,8 @@ RUN python3 -m pip uninstall -y torch-tensorrt
RUN python3 -m pip uninstall -y apex
RUN git clone https://github.com/NVIDIA/apex
# `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
RUN cd apex && git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
# TODO: check if there is alternative way to install latest apex
# RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .

# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
RUN python3 -m pip uninstall -y deepspeed

@@ -9,10 +9,9 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip

ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]

# If set to nothing, will install the latest version
ARG PYTORCH='2.0.1'
ARG PYTORCH='2.1.0'
ARG TORCH_VISION=''
ARG TORCH_AUDIO=''
# Example: `cu102`, `cu113`, etc.
@@ -22,6 +21,8 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch';
RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='$TORCH_VISION'.*' || VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='$TORCH_AUDIO'.*' || VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA

RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]

RUN python3 -m pip uninstall -y tensorflow flax

RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract

@@ -15,8 +15,28 @@
    title: Vorverarbeiten
  - local: training
    title: Optimierung eines vortrainierten Modells
  - local: run_scripts
    title: Trainieren mit einem Skript
  - local: accelerate
    title: Verteiltes Training mit 🤗 Accelerate
  - local: peft
    title: Laden und Trainieren von Adaptern mit 🤗 PEFT
  - local: model_sharing
    title: Ein Modell teilen
  - local: transformers_agents
    title: Agents
  - local: llm_tutorial
    title: Generation with LLMs
  title: Tutorials
- sections:
  - local: add_new_model
    title: Wie fügt man ein Modell zu 🤗 Transformers hinzu?
  - local: add_tensorflow_model
    title: Wie konvertiert man ein 🤗 Transformers-Modell in TensorFlow?
  - local: add_new_pipeline
    title: Wie fügt man eine Pipeline zu 🤗 Transformers hinzu?
  - local: testing
    title: Testen
  - local: pr_checks
    title: Überprüfung einer Pull Request
  title: Contribute

docs/source/de/add_new_model.md (new file)
@@ -0,0 +1,895 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# How to add a model to 🤗 Transformers?

The 🤗 Transformers library is often able to offer new models thanks to community contributors. But this can be a challenging project and requires an in-depth knowledge of the 🤗 Transformers library and of the model to implement. At Hugging Face, we're trying to empower more of the community to actively add models, and we've put together this guide to walk you through the process of adding a PyTorch model (make sure you have [PyTorch installed](https://pytorch.org/get-started/locally/)).

<Tip>

If you're interested in implementing a TensorFlow model, take a look at the [How to convert a 🤗 Transformers model to TensorFlow](add_tensorflow_model) guide!

</Tip>

Along the way, you'll:

- get insights into open-source best practices
- understand the design principles behind one of the most popular deep learning libraries
- learn how to efficiently test large models
- learn how to integrate Python utilities like `black`, `ruff`, and `make fix-copies` to ensure clean and readable code

A Hugging Face team member will be available to help you along the way, so you'll never be alone. 🤗 ❤️

To get started, open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue for the model you want to see in 🤗 Transformers. If you're not especially picky about contributing a specific model, you can filter by the [New model label](https://github.com/huggingface/transformers/labels/New%20model) to see if there are any unclaimed model requests and work on one of them.

Once you've opened a new model request, the first step is to get familiar with 🤗 Transformers if you aren't already!

## General overview of 🤗 Transformers

First, you should get a general overview of 🤗 Transformers. 🤗 Transformers is a very opinionated library, so there is a chance that you don't agree with some of the library's philosophies or design choices. From our experience, however, we found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗 Transformers while keeping maintenance costs at a reasonable level.

A good first starting point to better understand the library is to read the [documentation of our philosophy](philosophy). As a result of our way of working, there are some choices that we try to apply to all models:

- Composition is generally favored over abstraction
- Duplicating code is not always bad if it strongly improves the readability or accessibility of a model
- Model files are as self-contained as possible, so that when you read the code of a specific model, you ideally only have to look into the respective `modeling_....py` file.

In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the person who will use your model, but also everybody who will read, try to understand, and possibly tweak your code.

With this in mind, let's go a bit deeper into the general library design.

### Overview of models

To successfully add a model, it is important to understand the interaction between your model and its config, [`PreTrainedModel`], and [`PretrainedConfig`]. For exemplary purposes, we will call the model to be added to 🤗 Transformers `BrandNewBert`.

Let's take a look:

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_overview.png"/>

As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute minimum. There are never more than two levels of abstraction for any model in the library. `BrandNewBertModel` inherits from `BrandNewBertPreTrainedModel`, which in turn inherits from [`PreTrainedModel`], and that's it. As a general rule, we want to make sure that a new model only depends on [`PreTrainedModel`]. The important functionalities that are automatically provided to every new model are [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`], which are used for serialization and deserialization. All the other important functionalities, such as `BrandNewBertModel.forward`, should be completely defined in the new `modeling_brand_new_bert.py` script. Next, we want to make sure that a model with a specific head layer, such as `BrandNewBertForMaskedLM`, does not inherit from `BrandNewBertModel` but rather uses `BrandNewBertModel` as a component that can be called in its forward pass, to keep the level of abstraction low. Every new model requires a configuration class, called `BrandNewBertConfig`. This configuration is always stored as an attribute in [`PreTrainedModel`] and can therefore be accessed via the `config` attribute for all classes inheriting from `BrandNewBertPreTrainedModel`:

```python
model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
model.config  # model has access to its config
```

Similar to the model, the configuration inherits basic serialization and deserialization functionalities from [`PretrainedConfig`]. Note that the configuration and the model are always serialized into two different formats - the model to a *pytorch_model.bin* file and the configuration to a *config.json* file. Calling [`~PreTrainedModel.save_pretrained`] will automatically call [`~PretrainedConfig.save_pretrained`] as well, so that both the model and the configuration are saved.
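
A short sketch of that round trip, using this guide's placeholder `BrandNewBert` classes and an arbitrary local directory name:

```python
# Sketch of the save/load round trip described above; "BrandNewBert*" are the
# placeholder classes from this guide and "./brand_new_bert_checkpoint" is an
# arbitrary local directory.
model = BrandNewBertModel(BrandNewBertConfig())

# Writes pytorch_model.bin (the weights) and config.json (the configuration).
model.save_pretrained("./brand_new_bert_checkpoint")

# Restores both the configuration and the weights from that directory.
reloaded = BrandNewBertModel.from_pretrained("./brand_new_bert_checkpoint")
assert reloaded.config.to_dict() == model.config.to_dict()
```
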
### Code style

When coding your new model, keep in mind that Transformers is an opinionated library and we have a few quirks of our own regarding how code should be written :-)

1. Der Vorw├дrtsdurchlauf Ihres Modells sollte vollst├дndig in die Modellierungsdatei geschrieben werden und dabei v├╢llig unabh├дngig von anderen
|
||||
Modellen in der Bibliothek. Wenn Sie einen Block aus einem anderen Modell wiederverwenden m├╢chten, kopieren Sie den Code und f├╝gen ihn mit einem
|
||||
`# Kopiert von` ein (siehe [hier](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
|
||||
f├╝r ein gutes Beispiel und [hier](pr_checks#check-copies) f├╝r weitere Dokumentation zu Copied from).
|
||||
2. Der Code sollte vollst├дndig verst├дndlich sein, auch f├╝r einen Nicht-Muttersprachler. Das hei├Яt, Sie sollten
|
||||
beschreibende Variablennamen w├дhlen und Abk├╝rzungen vermeiden. Ein Beispiel: `activation` ist `act` vorzuziehen.
|
||||
Von Variablennamen mit nur einem Buchstaben wird dringend abgeraten, es sei denn, es handelt sich um einen Index in einer for-Schleife.
|
||||
3. Generell ziehen wir l├дngeren expliziten Code einem kurzen magischen Code vor.
|
||||
4. Vermeiden Sie die Unterklassifizierung von `nn.Sequential` in PyTorch, sondern unterklassifizieren Sie `nn.Module` und schreiben Sie den Vorw├дrtspass, so dass jeder
|
||||
so dass jeder, der Ihren Code verwendet, ihn schnell debuggen kann, indem er Druckanweisungen oder Haltepunkte hinzuf├╝gt.
|
||||
5. Ihre Funktionssignatur sollte mit einer Typ-Annotation versehen sein. Im ├Ьbrigen sind gute Variablennamen viel lesbarer und verst├дndlicher
|
||||
verst├дndlicher als Typ-Anmerkungen.
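
As a rough illustration of points 4 and 5 (not taken from any actual model file), a small block coded in this style
might look as follows:

```python
import torch
from torch import nn


class BrandNewBertIntermediate(nn.Module):
    """A single feed-forward block, written as an explicit nn.Module rather than nn.Sequential."""

    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.dense = nn.Linear(hidden_size, intermediate_size)
        self.activation = nn.GELU()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # easy to drop a print statement or a breakpoint between these lines when debugging
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states
```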

### Overview of tokenizers

Not quite ready yet :-( This section will be added soon!

## Step-by-step recipe to add a model to 🤗 Transformers

Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries
of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model:

1. [Porting a GPT2 model](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) by [Thomas](https://huggingface.co/thomwolf)
2. [Porting the WMT19 MT model](https://huggingface.co/blog/porting-fsmt) by [Stas](https://huggingface.co/stas)

From experience, we can tell you that the most important things to keep in mind when adding a model are:

- Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist
  somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy
  from. [grep](https://www.gnu.org/software/grep/) and [rg](https://github.com/BurntSushi/ripgrep) are your
  friends. Note that it might very well happen that your model's tokenizer is based on one model implementation and
  your model's modeling code on another one. *E.g.*, FSMT's modeling code is based on BART, while FSMT's tokenizer code
  is based on XLM.
- It's more of an engineering challenge than a scientific challenge. You should spend more time creating an
  efficient debugging environment rather than trying to understand all theoretical aspects of the model in the paper.
- Ask for help when you're stuck! Models are the core component of 🤗 Transformers, so we at Hugging Face are more
  than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making
  progress.

In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers.

The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do
list:

☐ (Optional) Understood the model's theoretical aspects<br>
☐ Prepared the 🤗 Transformers dev environment<br>
☐ Set up the debugging environment of the original repository<br>
☐ Created a script that successfully runs the `forward()` pass using the original repository and checkpoint<br>
☐ Successfully added the model skeleton to 🤗 Transformers<br>
☐ Successfully converted the original checkpoint to the 🤗 Transformers checkpoint<br>
☐ Successfully ran the `forward()` pass in 🤗 Transformers that gives identical output to the original checkpoint<br>
☐ Finished model tests in 🤗 Transformers<br>
☐ Successfully added tokenizer in 🤗 Transformers<br>
☐ Run end-to-end integration tests<br>
☐ Finished docs<br>
☐ Uploaded model weights to the Hub<br>
☐ Submitted the pull request<br>
☐ (Optional) Added a demo notebook

To begin with, we usually recommend starting by getting a good theoretical understanding of `BrandNewBert`. However,
if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive
into `BrandNewBert`'s code base. This option might suit you better if your engineering skills are better than
your theoretical skills, if you have trouble understanding `BrandNewBert`'s paper, or if you just enjoy programming
much more than reading scientific papers.

### 1. (Optional) Theoretical aspects of BrandNewBert

You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large
sections of the paper that are difficult to understand. If this is the case, that's fine - don't worry! The goal is
not to get a deep theoretical understanding of the paper, but to extract the necessary information required to
effectively re-implement the model in 🤗 Transformers. That being said, you don't have to spend too much time on the
theoretical aspects, but rather focus on the practical ones, namely:

- What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like
  encoder-decoder model? Look at the [model_summary](model_summary) if you're not familiar with the differences between those.
- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,*
  summarization?
- What is the novel feature of the model that makes it different from BERT/GPT-2/BART?
- Which of the already existing [🤗 Transformers models](https://huggingface.co/transformers/#contents) is most
  similar to *brand_new_bert*?
- What type of tokenizer is used? A sentencepiece tokenizer? A wordpiece tokenizer? Is it the same tokenizer as used
  for BERT or BART?

After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the
Hugging Face team with any questions you might have. This might include questions regarding the model's architecture,
its attention layer, etc. We will be more than happy to help you.

### 2. Next prepare your environment

1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the 'Fork' button on the
   repository's page. This creates a copy of the code under your GitHub user account.

2. Clone your `transformers` fork to your local disk, and add the base repository as a remote:

```bash
git clone https://github.com/[your Github handle]/transformers.git
cd transformers
git remote add upstream https://github.com/huggingface/transformers.git
```

3. Set up a development environment, for instance by running the following command:

```bash
python -m venv .env
source .env/bin/activate
pip install -e ".[dev]"
```

Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
(PyTorch, TensorFlow and/or Flax) and then do:

```bash
pip install -e ".[quality]"
```

which should be enough for most use cases. You can then return to the parent directory

```bash
cd ..
```

4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the
   instructions on https://pytorch.org/get-started/locally/.

**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient.

5. To port *brand_new_bert*, you will also need access to its original repository:

```bash
git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git
cd brand_new_bert
pip install -e .
```

Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers.

### 3.-4. Run a pretrained checkpoint using the original repository

At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very
"researchy". Meaning that documentation might be lacking and the code can be difficult to understand. But this should
be exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people
stand on the shoulders of giants*, which translates here very well into taking a working model and rewriting it to make
it as **accessible, user-friendly, and beautiful** as possible. This is the number-one motivation to re-implement
models into 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**.

You should start thereby by diving into the original repository.

Successfully running the official pretrained model in the original repository is often **the most difficult** step.
From our experience, it is very important to spend some time getting familiar with the original code-base. You need to
figure out the following:

- Where to find the pretrained weights?
- How to load the pretrained weights into the corresponding model?
- How to run the tokenizer independently from the model?
- Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually,
  you only have to reimplement those functions.
- Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes,
  *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers,
  *e.g.* *self-attention*, *cross-attention*...?
- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you
  work with an interactive debugger like *ipdb*, or should you use an efficient IDE to debug the model, like PyCharm?

It is very important that before you start the porting process, you can **efficiently** debug code in the original
repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or
even a pull request in the original repository. The maintainers of this repository are most likely very happy about
someone looking into their code!

At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original
model. We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to
dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. Only
at the very end, when the model has already been successfully ported to 🤗 Transformers, should one verify that the
model also works as expected on GPU.

In general, there are two possible debugging environments for running the original model

- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb)
- Local python scripts.

Jupyter notebooks have the advantage that they allow for cell-by-cell execution, which can be helpful to better split
logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also,
notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging
Face team for help. If you are familiar with Jupyter notebooks, we strongly recommend you work with them.

The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend
some time adjusting to the new programming environment and you might not be able to use your known debugging tools
anymore, like `ipdb`.

For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a
single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in
pseudocode):

```python
model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
input_ids = [0, 4, 5, 2, 3, 7, 9]  # vector of input ids
original_output = model.predict(input_ids)
```

Next, regarding the debugging strategy, there are generally a few from which to choose from:

- Decompose the original model into many small testable components and run a forward pass on each of those for
  verification
- Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on
  those components, and use intermediate print statements or breakpoints for verification

Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code
base.

If the original code-base allows you to decompose the model into smaller sub-components, *e.g.* if the original
code-base can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages
to taking the more difficult road in the beginning:

- at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically
  for each component individually that the corresponding component of the 🤗 Transformers implementation matches
  instead of relying on visual comparison via print statements
- it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting
  individual components and thus structure your work better
- separating the model into logical meaningful components will help you to get a better overview of the model's design
  and thus to better understand the model
- at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue
  changing your code

[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) integration checks of ELECTRA
give a nice example of how this can be done.

However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode,
it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good
example is the [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) library, which is
very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one
often relies on verifying print statements.

No matter which strategy you choose, the recommended procedure is often the same: you should start to debug the
starting layers first and the ending layers last.

It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following
layers in the following order:

1. Retrieve the input IDs passed to the model
2. Retrieve the word embeddings
3. Retrieve the input of the first Transformer layer
4. Retrieve the output of the first Transformer layer
5. Retrieve the output of the following n - 1 Transformer layers
6. Retrieve the output of the whole BrandNewBert Model

Input IDs should thereby consist of an array of integers, *e.g.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`

The outputs of the following layers often consist of multi-dimensional float arrays and can look like this:

```
[[
 [-0.1465, -0.6501,  0.1993,  ...,  0.1451,  0.3430,  0.6024],
 [-0.4417, -0.5920,  0.3450,  ..., -0.3062,  0.6182,  0.7132],
 [-0.5009, -0.7122,  0.4548,  ..., -0.3662,  0.6091,  0.7648],
 ...,
 [-0.5613, -0.6332,  0.4324,  ..., -0.3792,  0.7372,  0.9288],
 [-0.5416, -0.6345,  0.4180,  ..., -0.3564,  0.6992,  0.9191],
 [-0.5334, -0.6403,  0.4271,  ..., -0.3339,  0.6533,  0.8694]]],
```
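
If the original code base is a PyTorch model, one possible way to collect such intermediate outputs without modifying
the model itself is to register forward hooks. This is only a sketch assuming the original model is an `nn.Module`
whose sub-layers you have already located; the attribute paths below (`embeddings`, `encoder.layers`) are made up and
need to be adapted to the original model's structure:

```python
import torch

captured = {}


def make_hook(name):
    # store the (first) output tensor of the module under a readable name
    def hook(module, inputs, output):
        out = output[0] if isinstance(output, tuple) else output
        captured[name] = out.detach().cpu()

    return hook


# hypothetical attribute layout - adapt to the original model
model.embeddings.register_forward_hook(make_hook("word_embeddings"))
model.encoder.layers[0].register_forward_hook(make_hook("layer_0_output"))

with torch.no_grad():
    model(torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]]))

for name, tensor in captured.items():
    print(name, tensor.shape, tensor[0, :3, :3])
```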

We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original
model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001!
Since it is normal that the exact same model written in different libraries can give a slightly different output
depending on the library framework, we accept an error tolerance of 1e-3 (0.001). It is not enough if the model gives
nearly the same output, they have to be almost identical. Therefore, you will certainly compare the intermediate
outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of
*brand_new_bert*, in which case an **efficient** debugging environment of the original repository is absolutely
important. Here is some advice to make your debugging environment as efficient as possible.

- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should
  probably take the time to write a longer script that decomposes the original model into smaller sub-components to
  retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on
  TensorFlow print operations like [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) to output
  intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when
  running the forward pass, *e.g.* check out [this link](https://github.com/google/jax/issues/196).
- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle
  becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds.
  In case only very large checkpoints are available, it might make more sense to create a dummy model in the new
  environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version
  of your model
- Make sure you are using the easiest way of calling a forward pass in the original repository. Ideally, you want to
  find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called
  `predict`, `evaluate`, `forward` or `__call__`. You don't want to debug a function that calls `forward`
  multiple times, *e.g.* to generate text, like `autoregressive_sample`, `generate`.
- Try to separate the tokenization from the model's *forward* pass. If the original repository shows examples where
  you have to input a string, then try to find out where in the forward call the string input is changed to input ids
  and start from this point. This might mean that you have to possibly write a small script yourself or change the
  original code so that you can directly input the ids instead of an input string.
- Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield
  random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging
  environment is **deterministic** so that the dropout layers are not used. Or use *transformers.utils.set_seed*
  if the old and new implementations are in the same framework (a minimal sketch of such a deterministic setup
  follows this list).
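
As a minimal sketch of this last point, assuming both implementations are PyTorch models and `original_model`,
`hf_model` and `input_ids` are placeholders you have already created, a deterministic comparison setup could look like
this:

```python
import torch

from transformers import set_seed

set_seed(0)  # make any remaining randomness reproducible

original_model.eval()  # disables dropout layers in the original implementation
hf_model.eval()  # same for the 🤗 Transformers implementation

with torch.no_grad():  # no gradients needed for a pure output comparison
    original_output = original_model(input_ids)
    hf_output = hf_model(input_ids).last_hidden_state
```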

In the following section, you'll find more specific details/tips on how you can do this for *brand_new_bert*.

### 5.-14. Port BrandNewBert to 🤗 Transformers

Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers fork:

```bash
cd transformers
```

In the special case that you are adding a model whose architecture exactly matches the model architecture of an
existing model you only have to add a conversion script as described in [this section](#write-a-conversion-script).
In this case, you can just re-use the whole model architecture of the already existing model.

Otherwise, let's start generating a new model. You have two choices here:

- `transformers-cli add-new-model-like` to add a new model like an existing one
- `transformers-cli add-new-model` to add a new model from our template (will look like BERT or Bart depending on the type of model you select)

In both cases, you will be prompted with a questionnaire to fill in the basic information of your model. The second command requires installing `cookiecutter`, you can find more information on it [here](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model).

**Open a Pull Request on the main huggingface/transformers repo**

Before starting to adapt the automatically generated code, now is the time to open a "Work in progress (WIP)" pull
request, *e.g.* "[WIP] Add *brand_new_bert*", in 🤗 Transformers so that you and the Hugging Face team can work
side-by-side on integrating the model into 🤗 Transformers.

You should do the following:

1. Create a branch with a descriptive name from your main branch

```bash
git checkout -b add_brand_new_bert
```

2. Commit the automatically generated code:

```bash
git add .
git commit
```

3. Fetch and rebase to current main

```bash
git fetch upstream
git rebase upstream/main
```

4. Push the changes to your account using:

```bash
git push -u origin a-descriptive-name-for-my-changes
```

5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on "Pull request". Make sure to add the
   GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for
   future changes.

6. Change the PR into a draft by clicking on "Convert to draft" on the right of the GitHub pull request web page.

In the following, whenever you have made some progress, don't forget to commit your work and push it to your account so
that it shows in the pull request. Additionally, you should make sure to update your work with the current main from
time to time by doing:

```bash
git fetch upstream
git merge upstream/main
```

In general, all questions you might have regarding the model or your implementation should be asked in your PR and
discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
if you have a question. It is often very helpful to point the Hugging Face team to your added code so that the Hugging
Face team can efficiently understand your problem or question.

To do so, you can go to the "Files changed" tab where you see all of your changes, go to a line regarding which you
want to ask a question, and click on the "+" symbol to add a comment. Whenever a question or problem has been solved,
you can click on the "Resolve" button of the created comment.

In the same way, the Hugging Face team will open comments when reviewing your code. We recommend asking most questions
on GitHub on your PR. For some very general questions that are not very useful for the public, feel free to ping the
Hugging Face team by Slack or email.

**5. Adapt the generated models code for brand_new_bert**

At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be
found in the generated files `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and
`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`.

Now you can finally start coding :). The generated code in
`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT if
it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what
you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or
BART?*. Implement those changes, which often means changing the *self-attention* layer, the order of the normalization
layer, etc... Again, it is often useful to look at the similar architecture of already existing models in Transformers
to get a better feeling of how your model should be implemented.

**Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is
advised to add a first *unclean*, copy-pasted version of the original code to
`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` until you feel like all the necessary code is
added. From our experience, it is much more efficient to quickly add a first version of the required code and
improve/correct the code iteratively with the conversion script as described in the next section. The only thing that
has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the
following command should work:

```python
from transformers import BrandNewBertModel, BrandNewBertConfig

model = BrandNewBertModel(BrandNewBertConfig())
```

The above command will create a model according to the default parameters as defined in `BrandNewBertConfig()` with
random weights, thus making sure that the `init()` methods of all components work.
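
If you want to sanity-check non-default configurations as well, you can instantiate the model from a customized config.
The parameter names below are only illustrative - use whatever attributes your `BrandNewBertConfig` actually defines:

```python
from transformers import BrandNewBertConfig, BrandNewBertModel

# hypothetical parameters: a tiny configuration keeps instantiation fast while
# still exercising the __init__ methods of every sub-module
tiny_config = BrandNewBertConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=4)
tiny_model = BrandNewBertModel(tiny_config)
print(sum(p.numel() for p in tiny_model.parameters()), "parameters")
```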

Note that all random initialization should happen in the `_init_weights` method of your `BrandNewBertPreTrainedModel`
class. It should initialize all leaf modules depending on the variables of the config. Here is an example with the
BERT `_init_weights` method:

```py
def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)
```

You can have some more custom schemes if you need a special initialization for some modules. For instance, in
`Wav2Vec2ForPreTraining`, the last two linear layers need to have the initialization of the regular PyTorch `nn.Linear`
but all the other ones should use an initialization as above. This is coded like this:

```py
def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, Wav2Vec2ForPreTraining):
        module.project_hid.reset_parameters()
        module.project_q.reset_parameters()
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()
```

The `_is_hf_initialized` flag is internally used to make sure we only initialize a submodule once. By setting it to
`True` for `module.project_q` and `module.project_hid`, we make sure the custom initialization we did is not overridden
later on, i.e. the `_init_weights` function won't be applied to them.

**6. Write a conversion script**

Next, you should write a conversion script that lets you convert the checkpoint you used to debug *brand_new_bert* in
the original repository to a checkpoint compatible with your just created 🤗 Transformers implementation of
*brand_new_bert*. It is not advised to write the conversion script from scratch, but rather to look through already
existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in
the same framework as *brand_new_bert*. Usually, it is enough to copy an already existing conversion script and
slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already
existing conversion script for your model.

- If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script [here](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
- If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)

In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the
name of a layer is defined by the name of the class attribute you give the layer. Let's define a dummy model in
PyTorch, called `SimpleModel`, as follows:

```python
from torch import nn


class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(10, 10)
        self.intermediate = nn.Linear(10, 10)
        self.layer_norm = nn.LayerNorm(10)
```

Now we can create an instance of this model definition which will fill all weights: `dense`, `intermediate`,
`layer_norm` with random weights. We can print the model to see its architecture

```python
model = SimpleModel()

print(model)
```

This will print out the following:

```
SimpleModel(
  (dense): Linear(in_features=10, out_features=10, bias=True)
  (intermediate): Linear(in_features=10, out_features=10, bias=True)
  (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
)
```

We can see that the layer names are defined by the name of the class attribute in PyTorch. You can print out the weight
values of a specific layer:

```python
print(model.dense.weight.data)
```

to see that the weights were randomly initialized

```
tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
         -0.2077,  0.2157],
        [ 0.1044,  0.0201,  0.0990,  0.2482,  0.3116,  0.2509,  0.2866, -0.2190,
          0.2166, -0.0212],
        [-0.2000,  0.1107, -0.1999, -0.3119,  0.1559,  0.0993,  0.1776, -0.1950,
         -0.1023, -0.0447],
        [-0.0888, -0.1092,  0.2281,  0.0336,  0.1817, -0.0115,  0.2096,  0.1415,
         -0.1876, -0.2467],
        [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465,
          0.2577,  0.0402],
        [ 0.1502,  0.2465,  0.2566,  0.0693,  0.2352, -0.0530,  0.1859, -0.0604,
          0.2132,  0.1680],
        [ 0.1733, -0.2407, -0.1721,  0.1484,  0.0358, -0.0633, -0.0721, -0.0090,
          0.2707, -0.2509],
        [-0.1173,  0.1561,  0.2945,  0.0595, -0.1996,  0.2988, -0.0802,  0.0407,
          0.1829, -0.1568],
        [-0.1164, -0.2228, -0.0403,  0.0428,  0.1339,  0.0047,  0.1967,  0.2923,
          0.0333, -0.0536],
        [-0.1492, -0.1616,  0.1057,  0.1950, -0.2807, -0.2710, -0.1586,  0.0739,
          0.2220,  0.2358]]).
```

In the conversion script, you should fill those randomly initialized weights with the exact weights of the
corresponding layer in the checkpoint. *E.g.*

```python
# retrieve matching layer weights, e.g. by
# recursive algorithm
layer_name = "dense"
pretrained_weight = array_of_dense_layer

model_pointer = getattr(model, "dense")

model_pointer.weight.data = torch.from_numpy(pretrained_weight)
```

While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding
checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert
statements for the shape and print out the names of the checkpoint weights. E.g. you should add statements like:

```python
assert (
    model_pointer.weight.shape == pretrained_weight.shape
), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
```

Besides, you should also print out the names of both weights to make sure they match, *e.g.*

```python
logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
```

If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly
initialized layer of the 🤗 Transformers implementation.

An incorrect shape is most likely due to an incorrect setting of the config parameters in `BrandNewBertConfig()` that
do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that
PyTorch's implementation of a layer requires the weight to be transposed beforehand.

Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that
were not used for initialization to make sure the model is correctly converted. It is completely normal that the
conversion trials fail with either a wrong shape statement or a wrong name assignment. This is most likely because
either you used incorrect parameters in `BrandNewBertConfig()`, have a wrong architecture in the 🤗 Transformers
implementation, you have a bug in the `init()` functions of one of the components of the 🤗 Transformers
implementation, or you need to transpose one of the checkpoint weights.

This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the
Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save
the model under a folder of your choice `/path/to/converted/checkpoint/folder` that should then contain both a
`pytorch_model.bin` file and a `config.json` file:

```python
model.save_pretrained("/path/to/converted/checkpoint/folder")
```
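
Putting these pieces together, a bare-bones conversion script might be structured roughly as follows. This is only a
sketch: the way the original weights are stored (here assumed to be a flat dict of NumPy arrays in an `.npz` file) and
the name mapping between the original and the 🤗 Transformers parameters are made up and will be specific to
*brand_new_bert*:

```python
import numpy as np
import torch

from transformers import BrandNewBertConfig, BrandNewBertModel

# hypothetical: however the original repository stores its weights
original_state = dict(np.load("/path/to/original/weights.npz"))

config = BrandNewBertConfig()  # set the parameters used by the original checkpoint
model = BrandNewBertModel(config)

# hypothetical name mapping from original weight names to 🤗 Transformers parameter names
name_mapping = {
    "encoder/layer_0/dense/kernel": "encoder.layers.0.dense.weight",
}

hf_state = dict(model.named_parameters())
used_original_keys = set()
for original_name, hf_name in name_mapping.items():
    # cast to the dtype expected by the 🤗 Transformers parameter
    weight = torch.from_numpy(original_state[original_name]).to(hf_state[hf_name].dtype)
    assert weight.shape == hf_state[hf_name].shape, (
        f"Shape mismatch for {hf_name}: {weight.shape} vs {hf_state[hf_name].shape}"
    )
    hf_state[hf_name].data = weight
    used_original_keys.add(original_name)
    print(f"Initialized {hf_name} from {original_name}")

# make sure no checkpoint weight was silently ignored
unused = set(original_state) - used_original_keys
print("Unused original weights:", unused)

model.save_pretrained("/path/to/converted/checkpoint/folder")
```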

**7. Implement the forward pass**

Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make
sure that the forward pass is correctly implemented. In [Run a pretrained checkpoint using the original repository](#34-run-a-pretrained-checkpoint-using-the-original-repository),
you have already created a script that runs a forward pass of the model using the original repository. Now you should
write an analogous script using the 🤗 Transformers implementation instead of the original one. It should look as
follows:

```python
model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
output = model(input_ids).last_hidden_state
```

It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact
same output the very first time or that the forward pass throws an error. Don't be disappointed - it's expected! First,
you should make sure that the forward pass doesn't throw any errors. It often happens that the wrong dimensions are
used, leading to a *Dimensionality mismatch* error, or that the wrong data type is used, *e.g.* `torch.long`
instead of `torch.float32`. Don't hesitate to ask the Hugging Face team for help if you don't manage to solve
certain errors.

The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are
equivalent to a precision of `1e-3`. First, you should ensure that the output shapes are identical, *i.e.*
`outputs.shape` should yield the same value for the script of the 🤗 Transformers implementation and the original
implementation. Next, you should make sure that the output values are identical as well. This is one of the most
difficult parts of adding a new model. Common mistakes why the outputs are not identical are:

- Some layers were not added, *i.e.* an *activation* layer was not added, or the residual connection was forgotten
- The word embedding matrix was not tied
- The wrong positional embeddings are used because the original implementation uses an offset
- Dropout is applied during the forward pass. To fix this make sure *model.training is False* and that no dropout
  layer is falsely activated during the forward pass, *i.e.* pass *self.training* to [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)

The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗
Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out
the intermediate outputs of both implementations of the forward pass to find the exact position in the network where
the 🤗 Transformers implementation shows a different output than the original implementation. First, make sure that the
hardcoded `input_ids` in both scripts are identical. Next, verify that the outputs of the first transformation of the
`input_ids` (usually the word embeddings) are identical. And then work your way up to the very last layer of the
network. At some point, you will notice a difference between the two implementations, which should point you to the bug
in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements
in both the original implementation and the 🤗 Transformers implementation, at the same positions in the network
respectively, and to successively remove print statements showing the same values for intermediate representations.

When you're confident that both implementations yield the same output, verify the outputs with
`torch.allclose(original_output, output, atol=1e-3)`, and you're over the most difficult part! Congratulations - the
work left to be done should be a cakewalk 😊.
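
A small sketch of such a final comparison, assuming `original_output` was produced by the script for the original
repository and already converted to a `torch.Tensor`:

```python
import torch

from transformers import BrandNewBertModel

model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
model.eval()

input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
with torch.no_grad():
    output = model(input_ids).last_hidden_state

# same shape first, then element-wise agreement within the 1e-3 tolerance
assert output.shape == original_output.shape, f"{output.shape} vs {original_output.shape}"
max_diff = (output - original_output).abs().max().item()
print(f"Max absolute difference: {max_diff:.2e}")
assert torch.allclose(original_output, output, atol=1e-3), "Outputs differ by more than 1e-3"
```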

**8. Adding all necessary model tests**

At this point, you have successfully added a new model. However, it is very much possible that the model does not yet
fully comply with the required design. To make sure the implementation is fully compatible with 🤗 Transformers, all
common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under
the same `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`. Run this test file to verify that all common
tests pass:

```bash
pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py
```

Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so
that

- a) the community can easily understand your work by looking at specific tests of *brand_new_bert*
- b) future changes to your model will not break any important feature of the model.

At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts
you used earlier to implement the model in 🤗 Transformers. A template of those model tests has already been added by
the Cookiecutter, called `BrandNewBertModelIntegrationTests`, and only has to be filled out by you. To make sure that
those tests are passing, run

```bash
RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests
```

<Tip>

In case you are using Windows, you should replace `RUN_SLOW=1` with `SET RUN_SLOW=1`.

</Tip>

Second, all features that are special to *brand_new_bert* should additionally be tested in a separate test under
`BrandNewBertModelTester`/`BrandNewBertModelTest`. This part is often forgotten but is extremely useful in two
ways:

- It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the
  special features of *brand_new_bert* should work.
- Future contributors can quickly test changes to the model by running those special tests (a sketch of such a
  hard-coded integration test follows this list).
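
As a rough sketch, such an integration test typically loads the converted checkpoint, runs a forward pass on
hard-coded inputs and compares a slice of the output against hard-coded expected values. The repo id, the expected
numbers and the assumption that the config exposes a `hidden_size` attribute are placeholders, not real values:

```python
import unittest

import torch

from transformers import BrandNewBertModel
from transformers.testing_utils import require_torch, slow


@require_torch
class BrandNewBertModelIntegrationTests(unittest.TestCase):
    @slow
    def test_inference_no_head(self):
        model = BrandNewBertModel.from_pretrained("brand-new-bert-org/brand-new-bert-base")  # placeholder repo id
        model.eval()

        input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
        with torch.no_grad():
            output = model(input_ids).last_hidden_state

        expected_shape = torch.Size((1, 9, model.config.hidden_size))
        self.assertEqual(output.shape, expected_shape)

        # placeholder values - replace with the values produced by the original implementation
        expected_slice = torch.tensor(
            [[[-0.1465, -0.6501, 0.1993], [-0.4417, -0.5920, 0.3450], [-0.5009, -0.7122, 0.4548]]]
        )
        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
```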

**9. Implement the tokenizer**

Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent to or very similar to an
already existing tokenizer of 🤗 Transformers.

It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗
Transformers implementation of the tokenizer.

To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository
that inputs a string and returns the `input_ids`. It could look similar to this (in pseudo-code):

```python
input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
input_ids = model.tokenize(input_str)
```

You might have to take a deeper look again into the original repository to find the correct tokenizer function, or you
might even have to make changes to your clone of the original repository to only output the `input_ids`. Having written
a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be
created. It should look similar to this:

```python
from transformers import BrandNewBertTokenizer

input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."

tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")

input_ids = tokenizer(input_str).input_ids
```

When both `input_ids` yield the same values, as a final step a tokenizer test file should also be added.

Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should
contain a couple of hard-coded integration tests; a small sketch of such a test follows below.
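
A minimal sketch of such a hard-coded tokenizer integration test could look as follows (the expected IDs below are
placeholders that you would replace with the output of the original tokenizer):

```python
import unittest

from transformers import BrandNewBertTokenizer


class BrandNewBertTokenizationIntegrationTest(unittest.TestCase):
    def test_tokenizer_integration(self):
        tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")

        input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
        input_ids = tokenizer(input_str).input_ids

        # placeholder values - hard-code the ids produced by the original tokenizer here
        expected_ids = [0, 514, 334, 21, 18, 278, 1923]
        self.assertListEqual(input_ids, expected_ids)
```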

**10. Run end-to-end integration tests**

Having added the tokenizer, you should also add a few end-to-end integration tests using both the model and the
tokenizer to `tests/models/brand_new_bert/test_modeling_brand_new_bert.py` in 🤗 Transformers.
Such a test should show on a meaningful
text-to-text sample that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can
include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc... If
none of the ported checkpoints has been fine-tuned on a downstream task, it is enough to simply rely on the model
tests. In a final step to ensure that the model is fully functional, you should also run all tests on GPU. It can
happen that you forgot to add some `.to(self.device)` statements to internal tensors of the model, which in such a
test would show up as an error. In case you have no access to a GPU, the Hugging Face team can take care of running
those tests for you.
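
Sketched out, such an end-to-end check could combine tokenizer and model like this (the checkpoint path and the input
string are placeholders, and it is assumed the base model returns a `last_hidden_state`):

```python
import torch

from transformers import BrandNewBertModel, BrandNewBertTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/converted/checkpoint/folder")
model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder").to(device)
model.eval()

inputs = tokenizer("A meaningful text-to-text example goes here.", return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

# running on GPU surfaces any internal tensor that is missing a .to(self.device) call
print(outputs.last_hidden_state.shape, outputs.last_hidden_state.device)
```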

**11. Add Docstring**

Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is
a nice docstring and a doc page. The Cookiecutter should have added a template file called
`docs/source/model_doc/brand_new_bert.md` that you should fill out. Users of your model will usually first look at
this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for
the community to add some *Tips* to show how the model should be used. Don't hesitate to ping the Hugging Face team
regarding the docstrings.

Next, make sure that the docstring added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` is
correct and includes all necessary inputs and outputs. We have a detailed guide about writing documentation and our
docstring format [here](writing-documentation). It is always good to remind oneself that documentation should
be treated at least as carefully as the code in 🤗 Transformers, since the documentation is usually the first contact
point of the community with the model.

**Code refactor**

Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential
incorrect code style by running:

```bash
make style
```

and verify that your coding style passes the quality check:

```bash
make quality
```

There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in
the tests of your pull request. This is often because of some missing information in the docstring or some incorrect
naming. The Hugging Face team will surely help you if you're stuck here.

Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. With all
tests passing, now it's a good time to go over the added code again and do some refactoring.

You have now finished the coding part, congratulations! 🎉 You are awesome! 😎
|
||||
|
||||
**12. Laden Sie die Modelle in den Model Hub hoch**
|
||||
|
||||
In diesem letzten Teil sollten Sie alle Checkpoints konvertieren und in den Modell-Hub hochladen und eine Modellkarte f├╝r jeden
|
||||
hochgeladenen Modell-Kontrollpunkt. Sie k├╢nnen sich mit den Hub-Funktionen vertraut machen, indem Sie unsere [Model sharing and uploading Page](model_sharing) lesen. Hier sollten Sie mit dem Hugging Face-Team zusammenarbeiten, um einen passenden Namen f├╝r jeden
|
||||
Checkpoint festzulegen und die erforderlichen Zugriffsrechte zu erhalten, um das Modell unter der Organisation des Autors *brand_new_bert* hochladen zu k├╢nnen.
|
||||
*brand_new_bert*. Die Methode `push_to_hub`, die in allen Modellen in `transformers` vorhanden ist, ist ein schneller und effizienter Weg, Ihren Checkpoint in den Hub zu pushen. Ein kleines Snippet ist unten eingef├╝gt:
|
||||
|
||||
```python
|
||||
brand_new_bert.push_to_hub("brand_new_bert")
|
||||
# Uncomment the following line to push to an organization.
|
||||
# brand_new_bert.push_to_hub("<organization>/brand_new_bert")
|
||||
```
|
||||
|
||||
It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the
specific characteristics of this particular checkpoint, *e.g.*, on which dataset was the checkpoint
pretrained/fine-tuned? On what downstream task should the model be used? Also include some code on how to use the
model correctly.

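If you prefer to add the model card programmatically rather than editing the README on the Hub, the following is a
rough sketch only, assuming the `huggingface_hub` library and a repository name that matches the snippet above; the
content string is obviously a placeholder you would replace with the real card:

```python
from huggingface_hub import ModelCard

content = """
# brand_new_bert

Checkpoint pretrained on <dataset>; intended for <downstream task>.
"""

card = ModelCard(content)
# hypothetical repository id, mirroring the organization example in the snippet above
card.push_to_hub("<organization>/brand_new_bert")
```
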
**13. (Optional) Add notebook**

It is very helpful to add a notebook that showcases in detail how *brand_new_bert* can be used for inference and/or
fine-tuned on a downstream task. This is not mandatory to merge your PR, but very useful for the community.

**14. Submit your finished PR**

You're done programming now and can move to the last step, which is getting your PR merged into main. Usually, the
Hugging Face team should have helped you already at this point, but it is worth taking some time to give your
finished PR a nice description and eventually add comments to your code, if you want to point out certain design
choices to your reviewer.

### Share your work!!

Now, it's time to get some credit from the community for your work! Having completed a model addition is a major
contribution to Transformers and the whole NLP community. Your code and the ported pretrained models will certainly be
used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share
your achievement with the community.

**You have made another model that is super easy to access for everyone in the community! 🤯**
docs/source/de/add_new_pipeline.md (new file, 258 lines)
@@ -0,0 +1,258 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# How to create a custom pipeline?

In this guide, we will see how to create a custom pipeline and share it on the [Hub](hf.co/models) or add it to the
🤗 Transformers library.

First and foremost, you need to decide the raw inputs the pipeline will be able to take. It can be strings, raw bytes,
dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as
possible, as that makes compatibility easier (even through other languages via JSON). Those will be the inputs of the
pipeline (`preprocess`).

Then define the `outputs`. Same policy as for the inputs. The simpler, the better. Those will be the outputs of the
`postprocess` method.

Start by inheriting the base class `Pipeline` with the 4 methods needed to implement `preprocess`, `_forward`,
`postprocess`, and `_sanitize_parameters`.

```python
from transformers import Pipeline


class MyPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, maybe_arg=2):
        model_input = Tensor(inputs["input_ids"])
        return {"model_input": model_input}

    def _forward(self, model_inputs):
        # model_inputs == {"model_input": model_input}
        outputs = self.model(**model_inputs)
        # Maybe {"logits": Tensor(...)}
        return outputs

    def postprocess(self, model_outputs):
        best_class = model_outputs["logits"].softmax(-1)
        return best_class
```

The structure of this breakdown is to support relatively seamless CPU/GPU usage, while allowing pre/postprocessing to
be done on the CPU on different threads.

`preprocess` will take the originally defined inputs and turn them into something feedable to the model. It might
contain more information and is usually a `Dict`.

`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred called
method as it contains safeguards to make sure everything is working on the expected device. If anything is linked to a
real model, it belongs in the `_forward` method; anything else goes into the preprocess/postprocess methods.

`postprocess` will take the output of `_forward` and turn it into the final output that was decided earlier.

`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization time
`pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`.

The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`,
`_forward`, and `postprocess`. Don't fill anything if the caller didn't provide an extra parameter. That allows
keeping the default arguments in the function definition, which is always more "natural".

A classic example would be a `top_k` argument in the post processing of classification tasks.

```python
>>> pipe = pipeline("my-new-task")
>>> pipe("This is a test")
[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05}
{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]

>>> pipe("This is a test", top_k=2)
[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
```

In order to achieve that, we'll update our `postprocess` method with a default parameter set to `5` and edit
`_sanitize_parameters` to allow this new parameter.

```python
def postprocess(self, model_outputs, top_k=5):
    best_class = model_outputs["logits"].softmax(-1)
    # Add logic to handle top_k
    return best_class


def _sanitize_parameters(self, **kwargs):
    preprocess_kwargs = {}
    if "maybe_arg" in kwargs:
        preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]

    postprocess_kwargs = {}
    if "top_k" in kwargs:
        postprocess_kwargs["top_k"] = kwargs["top_k"]
    return preprocess_kwargs, {}, postprocess_kwargs
```

Try to keep the inputs/outputs very simple and ideally JSON-serializable, as it makes the pipeline usage very easy
without requiring users to understand new kinds of objects. It's also relatively common to support many different
types of arguments for ease of use (for example audio files, which can be filenames, URLs or pure bytes). A sketch of
such a permissive `preprocess` is shown below.

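The following is a minimal sketch (not part of the guide above) of how a `preprocess` method could accept several
input types -- a filename, a URL, or raw bytes. The `requests` dependency is assumed, and the other required pipeline
methods are omitted for brevity:

```python
import requests

from transformers import Pipeline


class MyFlexiblePipeline(Pipeline):
    # _sanitize_parameters, _forward, and postprocess are omitted for brevity

    def preprocess(self, inputs):
        if isinstance(inputs, str):
            if inputs.startswith("http://") or inputs.startswith("https://"):
                # a URL: download the raw bytes
                inputs = requests.get(inputs).content
            else:
                # a local filename: read the raw bytes
                with open(inputs, "rb") as f:
                    inputs = f.read()
        # at this point `inputs` is always raw bytes, ready for decoding/tokenization
        return {"raw_bytes": inputs}
```
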
## Adding it to the list of supported tasks

To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`:

```python
from transformers.pipelines import PIPELINE_REGISTRY

PIPELINE_REGISTRY.register_pipeline(
    "new-task",
    pipeline_class=MyPipeline,
    pt_model=AutoModelForSequenceClassification,
)
```

You can specify a default model if you want, in which case it should come with a specific revision (which can be the
name of a branch or a commit hash, here we took `"abcdef"`) as well as the type:

```python
PIPELINE_REGISTRY.register_pipeline(
    "new-task",
    pipeline_class=MyPipeline,
    pt_model=AutoModelForSequenceClassification,
    default={"pt": ("user/awesome_model", "abcdef")},
    type="text",  # current support type: text, audio, image, multimodal
)
```

## Share your pipeline on the Hub

To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a
Python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this:

```py
import numpy as np

from transformers import Pipeline


def softmax(outputs):
    maxes = np.max(outputs, axis=-1, keepdims=True)
    shifted_exp = np.exp(outputs - maxes)
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)


class PairClassificationPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "second_text" in kwargs:
            preprocess_kwargs["second_text"] = kwargs["second_text"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, text, second_text=None):
        return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework)

    def _forward(self, model_inputs):
        return self.model(**model_inputs)

    def postprocess(self, model_outputs):
        logits = model_outputs.logits[0].numpy()
        probabilities = softmax(logits)

        best_class = np.argmax(probabilities)
        label = self.model.config.id2label[best_class]
        score = probabilities[best_class].item()
        logits = logits.tolist()
        return {"label": label, "score": score, "logits": logits}
```

The implementation is framework agnostic and will work for PyTorch and TensorFlow models. If we have saved this in a
file named `pair_classification.py`, we can then import it and register it like this:

```py
from pair_classification import PairClassificationPipeline
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification

PIPELINE_REGISTRY.register_pipeline(
    "pair-classification",
    pipeline_class=PairClassificationPipeline,
    pt_model=AutoModelForSequenceClassification,
    tf_model=TFAutoModelForSequenceClassification,
)
```

Once this is done, we can use it with a pretrained model. For instance, `sgugger/finetuned-bert-mrpc` has been
fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not.

```py
from transformers import pipeline

classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc")
```

Then we can share it on the Hub by using the `save_pretrained` method in a `Repository`:

```py
from huggingface_hub import Repository

repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline")
classifier.save_pretrained("test-dynamic-pipeline")
repo.push_to_hub()
```

This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`,
along with saving the model and tokenizer of the pipeline, before pushing everything into the repository
`{your_username}/test-dynamic-pipeline`. After that, anyone can use it as long as they provide the option
`trust_remote_code=True`:

```py
from transformers import pipeline

classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True)
```

## Add the pipeline to 🤗 Transformers

If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the `pipelines`
submodule with the code of your pipeline, then add it to the list of tasks defined in `pipelines/__init__.py`.

Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py` with examples of the other
tests.

The `run_pipeline_test` function will be very generic and run on small random models on every possible architecture,
as defined by `model_mapping` and `tf_model_mapping`.

This is very important to test future compatibility, meaning if someone adds a new model for
`XXXForQuestionAnswering`, the pipeline test will attempt to run on it. Because the models are random, it's
impossible to check for actual values, which is why there is a helper `ANY` that will simply attempt to match the
TYPE of the pipeline output.

You also *need* to implement 2 (ideally 4) tests; an illustrative sketch follows this list.

- `test_small_model_pt` : Define 1 small model for this pipeline (it doesn't matter if the results don't make sense)
  and test the pipeline outputs. The results should be the same as `test_small_model_tf`.
- `test_small_model_tf` : Define 1 small model for this pipeline (it doesn't matter if the results don't make sense)
  and test the pipeline outputs. The results should be the same as `test_small_model_pt`.
- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to make
  sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
  sure there is no drift in future releases.
- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to make
  sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
  sure there is no drift in future releases.

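As a rough, illustrative sketch only (the real test files in the repository rely on additional shared scaffolding such
as `model_mapping`), a small PyTorch test for the `PairClassificationPipeline` written earlier in this guide could
look like this; the tiny checkpoint name is an assumption made for illustration:

```python
import unittest

from pair_classification import PairClassificationPipeline  # the file written earlier in this guide
from transformers import AutoModelForSequenceClassification, pipeline
from transformers.pipelines import PIPELINE_REGISTRY
from transformers.testing_utils import require_torch

PIPELINE_REGISTRY.register_pipeline(
    "pair-classification",
    pipeline_class=PairClassificationPipeline,
    pt_model=AutoModelForSequenceClassification,
)


@require_torch
class PairClassificationPipelineTests(unittest.TestCase):
    def test_small_model_pt(self):
        # a tiny random checkpoint keeps the test fast; the predictions themselves are meaningless
        classifier = pipeline(
            "pair-classification", model="hf-internal-testing/tiny-random-bert", framework="pt"
        )
        outputs = classifier("I like cats", second_text="I like dogs")
        # with a random model we only check the structure of the output, not its values
        self.assertIsInstance(outputs["label"], str)
        self.assertIsInstance(outputs["score"], float)
        self.assertIsInstance(outputs["logits"], list)
```
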
docs/source/de/add_tensorflow_model.md (new file, 356 lines)
@@ -0,0 +1,356 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# How to convert a 🤗 Transformers model to TensorFlow?

Having multiple frameworks available to use with 🤗 Transformers gives you flexibility to play to their strengths when
designing your application, but it implies that compatibility must be added on a per-model basis. The good news is
that adding TensorFlow compatibility to an existing model is simpler than
[adding a new model from scratch](add_new_model)! Whether you wish to have a deeper understanding of large TensorFlow
models, make a major open-source contribution, or enable TensorFlow for your model of choice, this guide is for you.

This guide empowers you, a member of our community, to contribute TensorFlow model weights and/or architectures to be
used in 🤗 Transformers, with minimal supervision from the Hugging Face team. Writing a new model is no small feat,
but hopefully this guide will make it less of a rollercoaster 🎢 and more of a walk in the park 🚶. Harnessing our
collective experiences is absolutely critical to make this process increasingly easier, and thus we highly encourage
you to suggest improvements to this guide!

Before you dive deeper, it is recommended that you check the following resources if you're new to 🤗 Transformers:
- [General overview of 🤗 Transformers](add_new_model#general-overview-of-transformers)
- [Hugging Face's TensorFlow Philosophy](https://huggingface.co/blog/tensorflow-philosophy)

In the remainder of this guide, you will learn what's needed to add a new TensorFlow model architecture, the procedure
to convert PyTorch into TensorFlow model weights, and how to efficiently debug mismatches across ML frameworks. Let's
get started!

<Tip>

Are you unsure whether the model you wish to use already has a corresponding TensorFlow architecture?

Check the `model_type` field of the `config.json` of your model of choice
([example](https://huggingface.co/bert-base-uncased/blob/main/config.json#L14)). If the corresponding model folder in
🤗 Transformers has a file whose name starts with "modeling_tf", it means that it has a corresponding TensorFlow
architecture ([example](https://github.com/huggingface/transformers/tree/main/src/transformers/models/bert)).

</Tip>
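
If you prefer to check programmatically rather than browsing the Hub, a quick optional way to read the `model_type`
field is through the public `AutoConfig` API:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("bert-base-uncased")
print(config.model_type)  # "bert" -> look for a modeling_tf_bert.py file in src/transformers/models/bert/
```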

## Step-by-step guide to add TensorFlow model architecture code

There are many ways to design a large model architecture, and multiple ways of implementing said design. However, you
might recall from our [general overview of 🤗 Transformers](add_new_model#general-overview-of-transformers) that we
are an opinionated bunch - the ease of use of 🤗 Transformers relies on consistent design choices. From experience, we
can tell you a few important things about adding TensorFlow models:

- Don't reinvent the wheel! More often than not, there are at least two reference implementations you should check:
  the PyTorch equivalent of the model you are implementing and other TensorFlow models for the same class of problems.
- Great model implementations survive the test of time. This doesn't happen because the code is pretty, but rather
  because the code is clear, easy to debug and build upon. If you make the life of the maintainers easy with your
  TensorFlow implementation, by replicating the same patterns as in other TensorFlow models and minimizing the
  mismatch to the PyTorch implementation, you ensure your contribution will be long lived.
- Ask for help when you're stuck! The 🤗 Transformers team is here to help, and we've probably found solutions to the
  same problems you're facing.

Here's an overview of the steps needed to add a TensorFlow model architecture:
1. Select the model you wish to convert
2. Prepare the transformers dev environment
3. (Optional) Understand theoretical aspects and the existing implementation
4. Implement the model architecture
5. Implement model tests
6. Submit the pull request
7. (Optional) Build demos and share with the world

### 1.-3. Prepare your model contribution

**1. Select the model you wish to convert**

Let's start off with the basics: the first thing you need to know is the architecture you want to convert. If you
don't have your eyes set on a specific architecture, asking the 🤗 Transformers team for suggestions is a great way to
maximize your impact - we will guide you towards the most prominent architectures that are missing on the TensorFlow
side. If the specific model you want to use with TensorFlow already has a TensorFlow architecture implementation in
🤗 Transformers but is lacking weights, feel free to jump straight into the
[weight conversion section](#adding-tensorflow-weights-to-hub) of this page.

For simplicity, the remainder of this guide assumes you've decided to contribute the TensorFlow version of
*BrandNewBert* (the same example as in the [guide](add_new_model) to add a new model from scratch).

<Tip>

Before starting the work on a TensorFlow model architecture, double-check that there is no ongoing effort in that
direction. You can search for `BrandNewBert` on the
[pull request GitHub page](https://github.com/huggingface/transformers/pulls?q=is%3Apr) to confirm that there is no
TensorFlow-related pull request.

</Tip>

**2. Prepare transformers dev environment**

Having selected the model architecture, open a draft PR to signal your intention to work on it. Follow the
instructions below to set up your environment and open a draft PR.

1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the 'Fork' button on the
   repository's page. This creates a copy of the code under your GitHub user account.

2. Clone your `transformers` fork to your local disk, and add the base repository as a remote:

```bash
git clone https://github.com/[your Github handle]/transformers.git
cd transformers
git remote add upstream https://github.com/huggingface/transformers.git
```

3. Set up a development environment, for instance by running the following command:

```bash
python -m venv .env
source .env/bin/activate
pip install -e ".[dev]"
```

Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
failure with this command. If that's the case, make sure you install TensorFlow and then do:

```bash
pip install -e ".[quality]"
```

**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient.

4. Create a branch with a descriptive name from your main branch

```bash
git checkout -b add_tf_brand_new_bert
```

5. Fetch and rebase to current main

```bash
git fetch upstream
git rebase upstream/main
```

6. Add an empty `.py` file in `transformers/src/models/brandnewbert/` named `modeling_tf_brandnewbert.py`. This will
   be your TensorFlow model file.

7. Push the changes to your account using:

```bash
git add .
git commit -m "initial commit"
git push -u origin add_tf_brand_new_bert
```

8. Once you are satisfied, go to the webpage of your fork on GitHub. Click on "Pull request". Make sure to add the
   GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified of
   future changes.

9. Change the PR into a draft by clicking on "Convert to draft" on the right side of the GitHub pull request web page.

Now you have set up a development environment to port *BrandNewBert* to TensorFlow in 🤗 Transformers.

**3. (Optional) Understand theoretical aspects and the existing implementation**

You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large
sections of the paper that are difficult to understand. If that's the case, that's fine - don't worry! The goal is not
to get a deep theoretical understanding of the paper, but to extract the necessary information required to effectively
re-implement the model in 🤗 Transformers using TensorFlow. That being said, you don't have to spend too much time on
the theoretical aspects, but rather focus on the practical ones, namely the existing model documentation page
(e.g. [model docs for BERT](model_doc/bert)).

After you've grasped the basics of the model you are about to implement, it's important to understand the existing
implementation. This is a great opportunity to confirm that a working implementation matches your expectations for the
model, as well as to foresee technical challenges on the TensorFlow side.

It's perfectly natural that you feel overwhelmed with the amount of information that you've just absorbed. It is
definitely not a requirement that you understand all facets of the model at this stage. Nevertheless, we highly
encourage you to clear any pressing questions in our [forum](https://discuss.huggingface.co/).

### 4. Model implementation

It's time to finally start coding. Our suggested starting point is the PyTorch file itself: copy the contents of
`modeling_brand_new_bert.py` inside `src/transformers/models/brand_new_bert/` into
`modeling_tf_brand_new_bert.py`. The goal of this section is to modify the file and update the import structure of
🤗 Transformers such that you can import `TFBrandNewBert` and
`TFBrandNewBert.from_pretrained(model_repo, from_pt=True)` successfully loads a working TensorFlow *BrandNewBert*
model.

Sadly, there is no prescription to convert a PyTorch model into TensorFlow. You can, however, follow our selection of
tips to make the process as smooth as possible:
- Prepend `TF` to the name of all classes (e.g. `BrandNewBert` becomes `TFBrandNewBert`).
- Most PyTorch operations have a direct TensorFlow replacement. For example, `torch.nn.Linear` corresponds to
  `tf.keras.layers.Dense`, `torch.nn.Dropout` corresponds to `tf.keras.layers.Dropout`, etc. If you're not sure about
  a specific operation, you can use the [TensorFlow documentation](https://www.tensorflow.org/api_docs/python/tf) or
  the [PyTorch documentation](https://pytorch.org/docs/stable/) (a small sketch follows this list).
- Look for patterns in the 🤗 Transformers codebase. If you come across a certain operation that doesn't have a direct
  replacement, the odds are that someone else already had the same problem.
- By default, keep the same variable names and structure as in PyTorch. This will make it easier to debug, track
  issues, and add fixes down the line.
- Some layers have different default values in each framework. A notable example is the batch normalization layer's
  epsilon (`1e-5` in [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d)
  and `1e-3` in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization)).
  Double-check the documentation!
- PyTorch's `nn.Parameter` variables typically need to be initialized within TF Layer's `build()`. See the following
  example: [PyTorch](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_vit_mae.py#L212) /
  [TensorFlow](https://github.com/huggingface/transformers/blob/655f72a6896c0533b1bdee519ed65a059c2425ac/src/transformers/models/vit_mae/modeling_tf_vit_mae.py#L220)
- If the PyTorch model has a `#copied from ...` on top of a function, the odds are that your TensorFlow model can also
  borrow that function from the architecture it was copied from, assuming it has a TensorFlow architecture.
- Assigning the `name` attribute correctly in TensorFlow functions is critical to do the `from_pt=True` weight
  cross-loading. `name` is almost always the name of the corresponding variable in the PyTorch code. If `name` is not
  properly set, you will see it in the error message when loading the model weights.
- The logic of the base model class, `BrandNewBertModel`, will actually reside in `TFBrandNewBertMainLayer`, a Keras
  layer subclass ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L719)).
  `TFBrandNewBertModel` will simply be a wrapper around this layer.
- Keras models need to be built in order to load pretrained weights. For that reason, `TFBrandNewBertPreTrainedModel`
  will need to hold an example of inputs to the model, the `dummy_inputs`
  ([example](https://github.com/huggingface/transformers/blob/4fd32a1f499e45f009c2c0dea4d81c321cba7e02/src/transformers/models/bert/modeling_tf_bert.py#L916)).
- If you get stuck, ask for help - we're here to help you! 🤗

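To make several of the tips above concrete, here is a minimal, illustrative sketch (the class names and config fields
are placeholders, not the real *BrandNewBert* code) of how a small PyTorch block and its TensorFlow counterpart
typically line up:

```python
import tensorflow as tf
import torch


class BrandNewBertOutput(torch.nn.Module):
    # PyTorch side: the reference implementation you start from
    def __init__(self, config):
        super().__init__()
        self.dense = torch.nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states):
        return self.dropout(self.dense(hidden_states))


class TFBrandNewBertOutput(tf.keras.layers.Layer):
    # TensorFlow side: `TF` prefix, same attribute names and structure as the PyTorch class
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        # `name` must match the PyTorch attribute name for `from_pt=True` weight cross-loading
        self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

    def call(self, hidden_states, training=False):
        # layers that behave differently at train time receive the `training` argument
        return self.dropout(self.dense(hidden_states), training=training)
```
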
In addition to the model file itself, you will also need to add the pointers to the model classes and related
documentation pages. You can complete this part entirely following the patterns in other PRs
([example](https://github.com/huggingface/transformers/pull/18020/files)). Here's a list of the needed manual changes:
- Include all public classes of *BrandNewBert* in `src/transformers/__init__.py`
- Add *BrandNewBert* classes to the corresponding Auto classes in `src/transformers/models/auto/modeling_tf_auto.py`
- Add the lazy loading classes related to *BrandNewBert* in `src/transformers/utils/dummy_tf_objects.py`
- Update the import structures for the public classes in `src/transformers/models/brand_new_bert/__init__.py`
- Add the documentation pointers to the public methods of *BrandNewBert* in `docs/source/de/model_doc/brand_new_bert.md`
- Add yourself to the list of contributors to *BrandNewBert* in `docs/source/de/model_doc/brand_new_bert.md`
- Finally, add a green tick ✅ to the TensorFlow column of *BrandNewBert* in `docs/source/de/index.md`

When you're happy with your implementation, run the following checklist to confirm that your model architecture is
ready (a short sketch of items 5 and 6 follows the checklist):
1. All layers that behave differently at train time (e.g. Dropout) are called with a `training` argument, which is
   propagated from the top-level classes
2. You have used `#copied from ...` whenever possible
3. `TFBrandNewBertMainLayer` and all classes that use it have their `call` function decorated with `@unpack_inputs`
4. `TFBrandNewBertMainLayer` is decorated with `@keras_serializable`
5. A TensorFlow model can be loaded from PyTorch weights using `TFBrandNewBert.from_pretrained(model_repo, from_pt=True)`
6. You can call the TensorFlow model using the expected input format

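For checklist items 5 and 6, a minimal sketch could look like the following; `TFBrandNewBert` is the placeholder
architecture used throughout this guide and `"model_repo"` stands in for a repository that holds PyTorch weights:

```python
from transformers import TFBrandNewBert  # placeholder class name, as in the rest of this guide

# item 5: cross-load the existing PyTorch weights into the TensorFlow architecture
model = TFBrandNewBert.from_pretrained("model_repo", from_pt=True)

# item 6: call the model with the expected input format (dummy_inputs is a convenient smoke test)
outputs = model(model.dummy_inputs)
```
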
### 5. Add model tests

Hurray, you've implemented a TensorFlow model! Now it's time to add tests to make sure that your model behaves as
expected. As in the previous section, we suggest you start by copying the `test_modeling_brand_new_bert.py` file in
`tests/models/brand_new_bert/` into `test_modeling_tf_brand_new_bert.py`, and continue by making the necessary
TensorFlow replacements. For now, in all `.from_pretrained()` calls you should use the `from_pt=True` flag to load the
existing PyTorch weights.

After you're done, it's time for the moment of truth: run the tests! 😬

```bash
NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \
py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py
```

The most likely outcome is that you'll see a bunch of errors. Don't worry, this is expected! Debugging ML models is
notoriously hard, and the key ingredient to success is patience (and `breakpoint()`). In our experience, the hardest
problems arise from subtle mismatches between ML frameworks, for which we have a few pointers at the end of this
guide. In other cases, a general test might not be directly applicable to your model, in which case we suggest an
override at the model test class level. Regardless of the issue, don't hesitate to ask for help in your draft pull
request if you're stuck.

When all tests pass, you can add your model to the 🤗 Transformers library! 🎉

### 6.-7. Ensure everyone can use your model

**6. Submit the pull request**

Once you're done with the implementation and the tests, it's time to submit a pull request. Before pushing your code,
run our code formatting utility, `make fixup` 🪄. This will automatically fix any formatting issues, which would cause
our automatic checks to fail.

It's now time to convert your draft pull request into a real pull request. To do so, click on the "Ready for review"
button and add Joao (`@gante`) and Matt (`@Rocketknight1`) as reviewers. A model pull request will need at least 3
reviewers, but they will take care of finding appropriate additional reviewers for your model.

After all reviewers are happy with the state of your PR, the final action point is to remove the `from_pt=True` flag
in `.from_pretrained()` calls. Since there are no TensorFlow weights, you will have to add them! Check the section
below for instructions on how to do it.

Finally, when the TensorFlow weights get merged, you have at least 3 reviewer approvals, and all CI checks are green,
double-check the tests locally one last time

```bash
NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \
py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py
```

and we will merge your PR! Congratulations on the milestone 🎉

**7. (Optional) Build demos and share with the world**

One of the hardest parts about open-source is discoverability. How can the other users learn about the existence of
your fabulous TensorFlow contribution? With proper communication, of course! 📣

There are mainly two ways to share your model with the community:
- Build demos. This includes Gradio demos, notebooks, and other fun ways to show off your model. We highly encourage
  you to add a notebook to our [community-driven demos](https://huggingface.co/docs/transformers/community).
- Share stories on social media like Twitter and LinkedIn. You should be proud of your work and share your achievement
  with the community - your model can now be used by thousands of engineers and researchers around the world 🌍! We
  will be happy to retweet your posts and help you share your work with the community.

## Adding TensorFlow weights to 🤗 Hub

Assuming that the TensorFlow model architecture is available in 🤗 Transformers, converting PyTorch weights into
TensorFlow weights is a breeze!

Here's how to do it:
1. Make sure you are logged into your Hugging Face account in your terminal. You can log in using the command
   `huggingface-cli login` (you can find your access tokens [here](https://huggingface.co/settings/tokens))
2. Run `transformers-cli pt-to-tf --model-name foo/bar`, where `foo/bar` is the name of the model repository
   containing the PyTorch weights you want to convert
3. Tag `@joaogante` and `@Rocketknight1` in the 🤗 Hub PR the command above has just created

That's it! 🎉

## Debugging mismatches across ML frameworks 🐛

At some point, when adding a new architecture or when creating TensorFlow weights for an existing architecture, you
might come across errors complaining about mismatches between PyTorch and TensorFlow. You might even decide to open
the model architecture code for the two frameworks, and find that they look identical. What's going on? 🤔

First of all, let's talk about why understanding these mismatches matters. Many community members will use 🤗
Transformers models out of the box, and trust that our models behave as expected. When there is a large mismatch
between the two frameworks, it implies that the model is not following the reference implementation for at least one
of the frameworks. This might lead to silent failures, in which the model runs but has poor performance. This is
arguably worse than a model that doesn't run at all! To that end, we aim at having a framework mismatch smaller than
`1e-5` at all stages of the model.

Like in other numerical problems, the devil is in the details. And as in any detail-oriented craft, the secret
ingredient here is patience. Here is our suggested workflow for when you come across this type of issue:
1. Locate the source of mismatches. The model you're converting probably has near identical inner variables up to a
   certain point. Place `breakpoint()` statements in the two frameworks' architectures, and compare the values of the
   numerical variables in a top-down fashion until you find the source of the problems (a small comparison sketch
   follows this list).
2. Now that you've located the source of the issue, get in touch with the 🤗 Transformers team. It is possible that
   we've seen a similar problem before and can promptly provide a solution. As a fallback, scan popular pages like
   StackOverflow and GitHub issues.
3. If there is no solution in sight, it means you'll have to go deeper. The good news is that you've located the
   issue, so you can focus on the problematic instruction, abstracting away the rest of the model! The bad news is
   that you'll have to venture into the source implementation of said instruction. In some cases, you might find an
   issue with a reference implementation - don't abstain from opening an issue in the upstream repository.

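The comparison itself is usually nothing more than converting both intermediate tensors to NumPy and looking at the
largest absolute difference. A minimal sketch, assuming `pt_hidden_states` and `tf_hidden_states` are the tensors you
captured at the same point of the two implementations (e.g. at a breakpoint):

```python
import numpy as np

# bring both tensors to plain NumPy arrays
pt_array = pt_hidden_states.detach().cpu().numpy()
tf_array = tf_hidden_states.numpy()

# the number we care about: aim for < 1e-5 at every stage of the model
max_abs_diff = np.max(np.abs(pt_array - tf_array))
print(f"max absolute difference: {max_abs_diff:.2e}")
```
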
In some cases, in discussion with the 🤗 Transformers team, we might find that fixing the mismatch is infeasible. When
the mismatch is very small in the output layers of the model (but potentially large in the hidden states), we might
decide to ignore it in favor of distributing the model. The `pt-to-tf` CLI mentioned above has a `--max-error` flag to
override the error message at weight conversion time.
docs/source/de/llm_tutorial.md (new file, 221 lines)
@@ -0,0 +1,221 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Generation with LLMs

[[open-in-colab]]

LLMs, or Large Language Models, are the key component behind text generation. In a nutshell, they consist of large
pretrained transformer models trained to predict the next word (or, more precisely, token) given some input text.
Since they predict one token at a time, you need to do something more elaborate to generate new sentences other than
just calling the model - you need to do autoregressive generation.

Autoregressive generation is the inference-time procedure of iteratively calling a model with its own generated
outputs, given a few initial inputs. In 🤗 Transformers, this is handled by the
[`~generation.GenerationMixin.generate`] method, which is available to all models with generative capabilities.

This tutorial will show you how to:

* Generate text with an LLM
* Avoid common pitfalls
* Take the next steps to get the most out of your LLM

Before you begin, make sure you have all the necessary libraries installed:

```bash
pip install transformers bitsandbytes>=0.39.0 -q
```

## Generate text

A language model trained for [causal language modeling](tasks/language_modeling) takes a sequence of text tokens as
input and returns the probability distribution for the next token.

<!-- [GIF 1 -- FWD PASS] -->
<figure class="image table text-center m-0 w-full">
    <video
        style="max-width: 90%; margin: auto;"
        autoplay loop muted playsinline
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_1_1080p.mov"
    ></video>
    <figcaption>"Forward pass of an LLM"</figcaption>
</figure>

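To make this concrete, here is a small sketch of a single forward pass that exposes that distribution. It uses a tiny
causal LM so it runs quickly on CPU; `"hf-internal-testing/tiny-random-gpt2"` is assumed to be available and is used
purely for illustration:

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")

inputs = tokenizer("A list of colors: red, blue", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape: (batch_size, sequence_length, vocab_size)

# the probability distribution over the *next* token comes from the last position
next_token_probs = logits[:, -1, :].softmax(dim=-1)
```
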
A critical aspect of autoregressive generation with LLMs is how to select the next token from this probability
distribution. Anything goes in this step as long as you end up with a token for the next iteration. This means it can
be as simple as selecting the most likely token from the probability distribution or as complex as applying a dozen
transformations before sampling from the resulting distribution.

<!-- [GIF 2 -- TEXT GENERATION] -->
<figure class="image table text-center m-0 w-full">
    <video
        style="max-width: 90%; margin: auto;"
        autoplay loop muted playsinline
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_2_1080p.mov"
    ></video>
    <figcaption>"Autoregressive generation iteratively selects the next token from a probability distribution to generate text"</figcaption>
</figure>

The process depicted above is repeated iteratively until some stopping condition is reached. Ideally, the stopping
condition is dictated by the model, which should learn when to output an end-of-sequence (`EOS`) token. If this is not
the case, generation stops when some predefined maximum length is reached.

Properly setting up the token selection step and the stopping condition is essential to make your model behave as
you'd expect on your task. That is why we have a [`~generation.GenerationConfig`] file associated with each model,
which contains a good default generative parameterization and is loaded alongside your model.

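If you are curious what those defaults look like for a given checkpoint, you can inspect them directly; this is a
small optional sketch and assumes the repository ships a `generation_config.json` (most recent models do):

```py
from transformers import GenerationConfig

generation_config = GenerationConfig.from_pretrained("openlm-research/open_llama_7b")
print(generation_config)  # shows defaults such as the bos/eos token ids and the maximum length
```
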
Let's talk code!

<Tip>

If you're interested in basic LLM usage, our high-level [`Pipeline`](pipeline_tutorial) interface is a great starting
point. However, LLMs often require advanced features like quantization and fine control of the token selection step,
which is best done through [`~generation.GenerationMixin.generate`]. Autoregressive generation with LLMs is also
resource-intensive and should be executed on a GPU for adequate throughput.

</Tip>

<!-- TODO: update example to llama 2 (or a newer popular baseline) when it becomes ungated -->
First, you need to load the model.

```py
>>> from transformers import AutoModelForCausalLM

>>> model = AutoModelForCausalLM.from_pretrained(
...     "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True
... )
```

You'll notice two flags in the `from_pretrained` call:

- `device_map` ensures the model is moved to your GPU(s)
- `load_in_4bit` applies [4-bit dynamic quantization](main_classes/quantization) to massively reduce the resource
  requirements

There are other ways to initialize a model, but this is a good baseline to begin with an LLM.

Next, you need to preprocess your text input with a [tokenizer](tokenizer_summary).

```py
>>> from transformers import AutoTokenizer

>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")
>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
```

The `model_inputs` variable holds the tokenized text input, as well as the attention mask. While
[`~generation.GenerationMixin.generate`] does its best effort to infer the attention mask when it is not passed, we
recommend passing it whenever possible for optimal results.

Finally, call the [`~generation.GenerationMixin.generate`] method to return the generated tokens, which should be
converted to text before printing.

```py
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'A list of colors: red, blue, green, yellow, black, white, and brown'
```

And that's it! In a few lines of code, you can harness the power of an LLM.

## Common pitfalls

There are many [generation strategies](generation_strategies), and sometimes the default values may not be appropriate
for your use case. If your outputs aren't aligned with what you're expecting, we've created a list of the most common
pitfalls and how to avoid them.

```py
>>> from transformers import AutoModelForCausalLM, AutoTokenizer

>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")
>>> tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default
>>> model = AutoModelForCausalLM.from_pretrained(
...     "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True
... )
```

### Generated output is too short/long

If not specified in the [`~generation.GenerationConfig`] file, `generate` returns up to 20 tokens by default. We
highly recommend manually setting `max_new_tokens` in your `generate` call to control the maximum number of new
tokens it can return. Keep in mind LLMs (more precisely,
[decoder-only models](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt)) also return the input prompt as part
of the output.

```py
>>> model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")

>>> # By default, the output will contain up to 20 tokens
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'A sequence of numbers: 1, 2, 3, 4, 5'

>>> # Setting `max_new_tokens` allows you to control the maximum length
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=50)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,'
```

### Incorrect generation mode

By default, and unless specified in the [`~generation.GenerationConfig`] file, `generate` selects the most likely
token at each iteration (greedy decoding). Depending on your task, this may be undesirable; creative tasks like
chatbots or writing an essay benefit from sampling. On the other hand, input-grounded tasks like audio transcription
or translation benefit from greedy decoding. Enable sampling with `do_sample=True`, and you can learn more about this
topic in this [blog post](https://huggingface.co/blog/how-to-generate).

```py
>>> # Set seed for reproducibility -- you don't need this unless you want full reproducibility
>>> from transformers import set_seed
>>> set_seed(0)

>>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")

>>> # LLM + greedy decoding = repetitive, boring output
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'I am a cat. I am a cat. I am a cat. I am a cat'

>>> # With sampling, the output becomes more creative!
>>> generated_ids = model.generate(**model_inputs, do_sample=True)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'I am a cat.\nI just need to be. I am always.\nEvery time'
```

### Wrong padding side

LLMs are [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt) architectures, meaning they continue
to iterate on your input prompt. If your inputs do not have the same length, they need to be padded. Since LLMs are
not trained to continue from pad tokens, your input needs to be left-padded. Make sure you also don't forget to pass
the attention mask to generate!

```py
>>> # The tokenizer initialized above has right-padding active by default: the 1st sequence,
>>> # which is shorter, has padding on the right side. Generation fails.
>>> model_inputs = tokenizer(
...     ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
... ).to("cuda")
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)[0]
''

>>> # With left-padding, it works as expected!
>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b", padding_side="left")
>>> tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default
>>> model_inputs = tokenizer(
...     ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
... ).to("cuda")
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 3, 4, 5, 6,'
```

<!-- TODO: when the prompting guide is ready, mention the importance of setting the right prompt in this section -->
|
||||
|
||||
## Weitere Ressourcen
|
||||
|
||||
Während der Prozess der autoregressiven Generierung relativ einfach ist, kann die optimale Nutzung Ihres LLM ein schwieriges Unterfangen sein, da es viele bewegliche Teile gibt. Hier einige Anregungen für Ihre nächsten Schritte, die Ihnen helfen, tiefer in die LLM-Nutzung und das Verständnis einzutauchen:
|
||||
|
||||
<!-- TODO: mit neuen Anleitungen vervollst├дndigen -->
|
||||
### Fortgeschrittene Nutzung von `generate`
|
||||
|
||||
1. [Leitfaden](generation_strategies) zur Steuerung verschiedener Generierungsmethoden, zur Einrichtung der Generierungskonfigurationsdatei und zum Streaming der Ausgabe;
|
||||
2. API-Referenz zu [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`] und [generate-bezogene Klassen](internal/generation_utils).
|
||||
|
||||
### LLM-Ranglisten
|
||||
|
||||
1. [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), das sich auf die Qualit├дt der Open-Source-Modelle konzentriert;
|
||||
2. [Open LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard), das sich auf den LLM-Durchsatz konzentriert.
|
||||
|
||||
### Latenz und Durchsatz
|
||||
|
||||
1. [Leitfaden](main_classes/quantization) zur dynamischen Quantisierung, der Ihnen zeigt, wie Sie Ihren Speicherbedarf drastisch reduzieren k├╢nnen.
|
||||
|
||||
### Verwandte Bibliotheken
|
||||
|
||||
1. [text-generation-inference](https://github.com/huggingface/text-generation-inference), ein produktionsreifer Server f├╝r LLMs;
|
||||
2. [`optimum`](https://github.com/huggingface/optimum), eine Erweiterung von 🤗 Transformers, die für bestimmte Hardware-Geräte optimiert ist.
|
||||
docs/source/de/peft.md (Normal file, 216 lines)
@@ -0,0 +1,216 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
тЪая╕П Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
# Adapter mit 🤗 PEFT laden
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Die [Parameter-Efficient Fine Tuning (PEFT)](https://huggingface.co/blog/peft) Methoden frieren die vorab trainierten Modellparameter w├дhrend der Feinabstimmung ein und f├╝gen eine kleine Anzahl trainierbarer Parameter (die Adapter) hinzu. Die Adapter werden trainiert, um aufgabenspezifische Informationen zu lernen. Es hat sich gezeigt, dass dieser Ansatz sehr speichereffizient ist und weniger Rechenleistung beansprucht, w├дhrend die Ergebnisse mit denen eines vollst├дndig feinabgestimmten Modells vergleichbar sind.
|
||||
|
||||
Adapter, die mit PEFT trainiert wurden, sind in der Regel um eine Gr├╢├Яenordnung kleiner als das vollst├дndige Modell, so dass sie bequem gemeinsam genutzt, gespeichert und geladen werden k├╢nnen.
|
||||
|
||||
<div class="flex flex-col justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/PEFT-hub-screenshot.png"/>
|
||||
<figcaption class="text-center">Die Adaptergewichte f├╝r ein OPTForCausalLM-Modell, die auf dem Hub gespeichert sind, sind nur ~6MB gro├Я, verglichen mit der vollen Gr├╢├Яe der Modellgewichte, die ~700MB betragen k├╢nnen.</figcaption>
|
||||
</div>
|
||||
|
||||
Wenn Sie mehr ├╝ber die ЁЯдЧ PEFT-Bibliothek erfahren m├╢chten, sehen Sie sich die [Dokumentation](https://huggingface.co/docs/peft/index) an.
|
||||
|
||||
## Setup
|
||||
|
||||
Starten Sie mit der Installation von ЁЯдЧ PEFT:
|
||||
|
||||
```bash
|
||||
pip install peft
|
||||
```
|
||||
|
||||
Wenn Sie die brandneuen Funktionen ausprobieren m├╢chten, sollten Sie die Bibliothek aus dem Quellcode installieren:
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/peft.git
|
||||
```
|
||||
|
||||
## Unterstützte PEFT-Modelle
|
||||
|
||||
Transformers unterst├╝tzt nativ einige PEFT-Methoden, d.h. Sie k├╢nnen lokal oder auf dem Hub gespeicherte Adaptergewichte laden und sie mit wenigen Zeilen Code einfach ausf├╝hren oder trainieren. Die folgenden Methoden werden unterst├╝tzt:
|
||||
|
||||
- [Low Rank Adapters](https://huggingface.co/docs/peft/conceptual_guides/lora)
|
||||
- [IA3](https://huggingface.co/docs/peft/conceptual_guides/ia3)
|
||||
- [AdaLoRA](https://arxiv.org/abs/2303.10512)
|
||||
|
||||
Wenn Sie andere PEFT-Methoden, wie z.B. Prompt Learning oder Prompt Tuning, verwenden m├╢chten, oder ├╝ber die ЁЯдЧ PEFT-Bibliothek im Allgemeinen, lesen Sie bitte die [Dokumentation](https://huggingface.co/docs/peft/index).
|
||||
|
||||
|
||||
## Laden Sie einen PEFT-Adapter
|
||||
|
||||
Um ein PEFT-Adaptermodell von ЁЯдЧ Transformers zu laden und zu verwenden, stellen Sie sicher, dass das Hub-Repository oder das lokale Verzeichnis eine `adapter_config.json`-Datei und die Adaptergewichte enth├дlt, wie im obigen Beispielbild gezeigt. Dann k├╢nnen Sie das PEFT-Adaptermodell mit der Klasse `AutoModelFor` laden. Um zum Beispiel ein PEFT-Adaptermodell f├╝r die kausale Sprachmodellierung zu laden:
|
||||
|
||||
1. Geben Sie die PEFT-Modell-ID an.
|
||||
2. Übergeben Sie sie an die Klasse [`AutoModelForCausalLM`].
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
peft_model_id = "ybelkada/opt-350m-lora"
|
||||
model = AutoModelForCausalLM.from_pretrained(peft_model_id)
|
||||
```
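
Zur Veranschaulichung eine kurze Skizze, wie sich mit dem geladenen Adaptermodell direkt Text generieren lässt (dass `facebook/opt-350m` das passende Basismodell für diesen Adapter ist, ist hier eine Annahme; siehe auch das Beispiel weiter unten):

```py
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

inputs = tokenizer("Hello, my name is", return_tensors="pt")
# Das Modell mit geladenem Adapter verhält sich wie ein gewöhnliches CausalLM-Modell
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```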
|
||||
|
||||
<Tip>
|
||||
|
||||
Sie k├╢nnen einen PEFT-Adapter entweder mit einer `AutoModelFor`-Klasse oder der Basismodellklasse wie `OPTForCausalLM` oder `LlamaForCausalLM` laden.
|
||||
|
||||
</Tip>
|
||||
|
||||
Sie k├╢nnen einen PEFT-Adapter auch laden, indem Sie die Methode `load_adapter` aufrufen:
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model_id = "facebook/opt-350m"
|
||||
peft_model_id = "ybelkada/opt-350m-lora"
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id)
|
||||
model.load_adapter(peft_model_id)
|
||||
```
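
Sie können dem Adapter dabei auch einen eigenen Namen geben; nach unserem Verständnis akzeptiert `load_adapter` dafür ein `adapter_name`-Argument (hier nur als Skizze):

```py
# Skizze: dem geladenen Adapter einen expliziten Namen geben
# (das Argument `adapter_name` ist hier eine Annahme über die API)
model.load_adapter(peft_model_id, adapter_name="lora_1")
```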
|
||||
|
||||
## Laden in 8bit oder 4bit
|
||||
|
||||
Die `bitsandbytes`-Integration unterst├╝tzt Datentypen mit 8bit und 4bit Genauigkeit, was f├╝r das Laden gro├Яer Modelle n├╝tzlich ist, weil es Speicher spart (lesen Sie den `bitsandbytes`-Integrations [guide](./quantization#bitsandbytes-integration), um mehr zu erfahren). F├╝gen Sie die Parameter `load_in_8bit` oder `load_in_4bit` zu [`~PreTrainedModel.from_pretrained`] hinzu und setzen Sie `device_map="auto"`, um das Modell effektiv auf Ihre Hardware zu verteilen:
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
peft_model_id = "ybelkada/opt-350m-lora"
|
||||
model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True)
|
||||
```
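
Analog dazu, als kurze Skizze, das Laden mit 4bit-Genauigkeit über den oben erwähnten Parameter `load_in_4bit`:

```py
from transformers import AutoModelForCausalLM

peft_model_id = "ybelkada/opt-350m-lora"
model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_4bit=True)
```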
|
||||
|
||||
## Einen neuen Adapter hinzufügen
|
||||
|
||||
Sie k├╢nnen [`~peft.PeftModel.add_adapter`] verwenden, um einen neuen Adapter zu einem Modell mit einem bestehenden Adapter hinzuzuf├╝gen, solange der neue Adapter vom gleichen Typ ist wie der aktuelle Adapter. Wenn Sie zum Beispiel einen bestehenden LoRA-Adapter an ein Modell angeh├дngt haben:
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
|
||||
from peft import LoraConfig
|
||||
|
||||
model_id = "facebook/opt-350m"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id)
|
||||
|
||||
lora_config = LoraConfig(
|
||||
target_modules=["q_proj", "k_proj"],
|
||||
init_lora_weights=False
|
||||
)
|
||||
|
||||
model.add_adapter(lora_config, adapter_name="adapter_1")
|
||||
```
|
||||
|
||||
Um einen neuen Adapter hinzuzuf├╝gen:
|
||||
|
||||
```py
|
||||
# attach new adapter with same config
|
||||
model.add_adapter(lora_config, adapter_name="adapter_2")
|
||||
```
|
||||
|
||||
Jetzt k├╢nnen Sie mit [`~peft.PeftModel.set_adapter`] festlegen, welcher Adapter verwendet werden soll:
|
||||
|
||||
```py
|
||||
# use adapter_1
|
||||
model.set_adapter("adapter_1")
|
||||
output = model.generate(**inputs)
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
|
||||
# use adapter_2
|
||||
model.set_adapter("adapter_2")
|
||||
output_enabled = model.generate(**inputs)
|
||||
print(tokenizer.decode(output_enabled[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
## Aktivieren und Deaktivieren von Adaptern
|
||||
|
||||
Sobald Sie einen Adapter zu einem Modell hinzugef├╝gt haben, k├╢nnen Sie das Adaptermodul aktivieren oder deaktivieren. So aktivieren Sie das Adaptermodul:
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
|
||||
from peft import PeftConfig
|
||||
|
||||
model_id = "facebook/opt-350m"
|
||||
adapter_model_id = "ybelkada/opt-350m-lora"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
text = "Hello"
|
||||
inputs = tokenizer(text, return_tensors="pt")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id)
|
||||
peft_config = PeftConfig.from_pretrained(adapter_model_id)
|
||||
|
||||
# to initiate with random weights
|
||||
peft_config.init_lora_weights = False
|
||||
|
||||
model.add_adapter(peft_config)
|
||||
model.enable_adapters()
|
||||
output = model.generate(**inputs)
|
||||
```
|
||||
|
||||
So deaktivieren Sie das Adaptermodul:
|
||||
|
||||
```py
|
||||
model.disable_adapters()
|
||||
output = model.generate(**inputs)
|
||||
```
|
||||
|
||||
## PEFT-Adapter trainieren
|
||||
|
||||
PEFT-Adapter werden von der Klasse [`Trainer`] unterst├╝tzt, so dass Sie einen Adapter f├╝r Ihren speziellen Anwendungsfall trainieren k├╢nnen. Dazu m├╝ssen Sie nur ein paar weitere Codezeilen hinzuf├╝gen. Zum Beispiel, um einen LoRA-Adapter zu trainieren:
|
||||
|
||||
<Tip>
|
||||
|
||||
Wenn Sie mit der Feinabstimmung eines Modells mit [`Trainer`] noch nicht vertraut sind, werfen Sie einen Blick auf das Tutorial [Feinabstimmung eines vortrainierten Modells](training).
|
||||
|
||||
</Tip>
|
||||
|
||||
1. Definieren Sie Ihre Adapterkonfiguration mit dem Aufgabentyp und den Hyperparametern (siehe [`~peft.LoraConfig`] f├╝r weitere Details dar├╝ber, was die Hyperparameter tun).
|
||||
|
||||
```py
|
||||
from peft import LoraConfig
|
||||
|
||||
peft_config = LoraConfig(
|
||||
lora_alpha=16,
|
||||
lora_dropout=0.1,
|
||||
r=64,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM",
|
||||
)
|
||||
```
|
||||
|
||||
2. F├╝gen Sie dem Modell einen Adapter hinzu.
|
||||
|
||||
```py
|
||||
model.add_adapter(peft_config)
|
||||
```
|
||||
|
||||
3. Jetzt k├╢nnen Sie das Modell an [`Trainer`] ├╝bergeben!
|
||||
|
||||
```py
|
||||
trainer = Trainer(model=model, ...)
|
||||
trainer.train()
|
||||
```
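
Etwas ausführlicher könnte das zum Beispiel so aussehen (nur eine Skizze: `train_dataset` steht hier für einen bereits tokenisierten Datensatz, und die Hyperparameter sind frei gewählte Annahmen):

```py
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="opt-350m-lora",      # frei gewählter Ausgabeordner
    per_device_train_batch_size=4,
    learning_rate=2e-4,
    num_train_epochs=1,
)

trainer = Trainer(
    model=model,                     # Modell mit hinzugefügtem LoRA-Adapter
    args=training_args,
    train_dataset=train_dataset,     # Annahme: ein bereits tokenisierter Datensatz
)
trainer.train()
```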
|
||||
|
||||
So speichern Sie Ihren trainierten Adapter und laden ihn wieder:
|
||||
|
||||
```py
|
||||
model.save_pretrained(save_dir)
|
||||
model = AutoModelForCausalLM.from_pretrained(save_dir)
|
||||
```
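
Alternativ können Sie den gespeicherten Adapter auch wieder auf ein frisch geladenes Basismodell legen, etwa mit `load_adapter` (eine Skizze; sie setzt voraus, dass `save_dir` wie oben beschrieben eine `adapter_config.json` und die Adaptergewichte enthält):

```py
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
model.load_adapter(save_dir)  # lädt die in `save_dir` gespeicherten Adaptergewichte
```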
|
||||
|
||||
<!--
|
||||
TODO: (@younesbelkada @stevhliu)
|
||||
- Link to PEFT docs for further details
|
||||
- Trainer
|
||||
- 8-bit / 4-bit examples ?
|
||||
-->
|
||||
docs/source/de/pr_checks.md (Normal file, 199 lines)
@@ -0,0 +1,199 @@
|
||||
<!---
|
||||
Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
тЪая╕П Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Überprüfungen bei einer Pull-Anfrage
|
||||
|
||||
Wenn Sie eine Pull-Anfrage f├╝r ЁЯдЧ Transformers ├╢ffnen, wird eine ganze Reihe von Pr├╝fungen durchgef├╝hrt, um sicherzustellen, dass der Patch, den Sie hinzuf├╝gen, nichts Bestehendes zerst├╢rt. Es gibt vier Arten von Pr├╝fungen:
|
||||
- regul├дre Tests
|
||||
- Erstellung der Dokumentation
|
||||
- Stil von Code und Dokumentation
|
||||
- allgemeine Konsistenz des Repository
|
||||
|
||||
In diesem Dokument werden wir versuchen zu erkl├дren, worum es sich bei diesen verschiedenen Pr├╝fungen handelt und wie Sie sie lokal debuggen k├╢nnen, wenn eine der Pr├╝fungen in Ihrer PR fehlschl├дgt.
|
||||
|
||||
Beachten Sie, dass Sie im Idealfall eine Dev-Installation ben├╢tigen:
|
||||
|
||||
```bash
|
||||
pip install transformers[dev]
|
||||
```
|
||||
|
||||
oder f├╝r eine bearbeitbare Installation:
|
||||
|
||||
```bash
|
||||
pip install -e .[dev]
|
||||
```
|
||||
|
||||
innerhalb des Transformers-Repos. Da die Anzahl der optionalen Abhängigkeiten von Transformers stark zugenommen hat, ist es möglich, dass Sie nicht alle davon bekommen können. Wenn die Dev-Installation fehlschlägt, stellen Sie sicher, dass Sie das Deep-Learning-Framework, mit dem Sie arbeiten, installieren (PyTorch, TensorFlow und/oder Flax), und führen Sie dann Folgendes aus:
|
||||
|
||||
```bash
|
||||
pip install transformers[quality]
|
||||
```
|
||||
|
||||
oder f├╝r eine bearbeitbare Installation:
|
||||
|
||||
```bash
|
||||
pip install -e .[quality]
|
||||
```
|
||||
|
||||
|
||||
## Tests
|
||||
|
||||
Alle Jobs, die mit `ci/circleci: run_tests_` beginnen, f├╝hren Teile der Transformers-Testsuite aus. Jeder dieser Jobs konzentriert sich auf einen Teil der Bibliothek in einer bestimmten Umgebung: `ci/circleci: run_tests_pipelines_tf` zum Beispiel f├╝hrt den Pipelines-Test in einer Umgebung aus, in der nur TensorFlow installiert ist.
|
||||
|
||||
Beachten Sie, dass nur ein Teil der Testsuite jedes Mal ausgef├╝hrt wird, um zu vermeiden, dass Tests ausgef├╝hrt werden, wenn es keine wirkliche ├Дnderung in den Modulen gibt, die sie testen: ein Dienstprogramm wird ausgef├╝hrt, um die Unterschiede in der Bibliothek zwischen vor und nach dem PR zu ermitteln (was GitHub Ihnen auf der Registerkarte "Files changes" anzeigt) und die Tests auszuw├дhlen, die von diesem Unterschied betroffen sind. Dieses Dienstprogramm kann lokal mit ausgef├╝hrt werden:
|
||||
|
||||
```bash
|
||||
python utils/tests_fetcher.py
|
||||
```
|
||||
|
||||
aus dem Stammverzeichnis des Transformers-Repositoriums. Es wird:
|
||||
|
||||
1. Für jede Datei im Diff prüfen, ob die Änderungen im Code liegen oder nur Kommentare bzw. Docstrings betreffen. Nur die Dateien mit echten Codeänderungen werden beibehalten.
2. Eine interne Map erstellen, die für jede Datei des Quellcodes der Bibliothek alle Dateien angibt, auf die sie sich rekursiv auswirkt. Modul A wirkt sich auf Modul B aus, wenn Modul B Modul A importiert. Für die rekursive Auswirkung benötigen wir eine Kette von Modulen, die von Modul A zu Modul B führt und in der jedes Modul das vorherige importiert.
3. Diese Zuordnung auf die in Schritt 1 gesammelten Dateien anwenden. So erhalten wir die Liste der Modelldateien, die von der PR betroffen sind.
4. Jede dieser Dateien der bzw. den entsprechenden Testdatei(en) zuordnen und so die Liste der auszuführenden Tests erhalten.
|
||||
|
||||
Wenn Sie das Skript lokal ausf├╝hren, sollten Sie die Ergebnisse von Schritt 1, 3 und 4 ausgegeben bekommen und somit wissen, welche Tests ausgef├╝hrt werden. Das Skript erstellt au├Яerdem eine Datei namens `test_list.txt`, die die Liste der auszuf├╝hrenden Tests enth├дlt, die Sie mit dem folgenden Befehl lokal ausf├╝hren k├╢nnen:
|
||||
|
||||
```bash
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt)
|
||||
```
|
||||
|
||||
F├╝r den Fall, dass Ihnen etwas entgangen ist, wird die komplette Testreihe ebenfalls t├дglich ausgef├╝hrt.
|
||||
|
||||
## Dokumentation erstellen
|
||||
|
||||
Der Job `build_pr_documentation` erstellt und generiert eine Vorschau der Dokumentation, um sicherzustellen, dass alles in Ordnung ist, wenn Ihr PR zusammengef├╝hrt wird. Ein Bot f├╝gt einen Link zur Vorschau der Dokumentation zu Ihrem PR hinzu. Alle ├Дnderungen, die Sie an dem PR vornehmen, werden automatisch in der Vorschau aktualisiert. Wenn die Dokumentation nicht erstellt werden kann, klicken Sie auf **Details** neben dem fehlgeschlagenen Auftrag, um zu sehen, wo der Fehler liegt. Oft ist der Fehler so einfach wie eine fehlende Datei im `toctree`.
|
||||
|
||||
Wenn Sie daran interessiert sind, die Dokumentation lokal zu erstellen oder in der Vorschau anzusehen, werfen Sie einen Blick in die [`README.md`](https://github.com/huggingface/transformers/tree/main/docs) im Ordner docs.
|
||||
|
||||
## Code und Dokumentationsstil
|
||||
|
||||
Die Formatierung des Codes erfolgt f├╝r alle Quelldateien, die Beispiele und die Tests mit `black` und `ruff`. Wir haben auch ein benutzerdefiniertes Tool, das sich um die Formatierung von docstrings und `rst`-Dateien k├╝mmert (`utils/style_doc.py`), sowie um die Reihenfolge der Lazy-Importe, die in den Transformers `__init__.py`-Dateien durchgef├╝hrt werden (`utils/custom_init_isort.py`). All dies k├╢nnen Sie starten, indem Sie Folgendes ausf├╝hren
|
||||
|
||||
```bash
|
||||
make style
|
||||
```
|
||||
|
||||
Das CI pr├╝ft, ob diese innerhalb der Pr├╝fung `ci/circleci: check_code_quality` angewendet wurden. Es f├╝hrt auch `ruff` aus, das einen grundlegenden Blick auf Ihren Code wirft und sich beschwert, wenn es eine undefinierte Variable findet oder eine, die nicht verwendet wird. Um diese Pr├╝fung lokal auszuf├╝hren, verwenden Sie
|
||||
|
||||
```bash
|
||||
make quality
|
||||
```
|
||||
|
||||
Dies kann sehr viel Zeit in Anspruch nehmen. Um dasselbe nur f├╝r die Dateien zu tun, die Sie im aktuellen Zweig ge├дndert haben, f├╝hren Sie
|
||||
|
||||
```bash
|
||||
make fixup
|
||||
```
|
||||
|
||||
Dieser letzte Befehl f├╝hrt auch alle zus├дtzlichen Pr├╝fungen f├╝r die Konsistenz des Repositorys durch. Schauen wir uns diese an.
|
||||
|
||||
## Repository-Konsistenz
|
||||
|
||||
Dies fasst alle Tests zusammen, die sicherstellen, dass Ihr PR das Repository in einem guten Zustand verl├дsst. Sie k├╢nnen diese Pr├╝fung lokal durchf├╝hren, indem Sie Folgendes ausf├╝hren:
|
||||
|
||||
```bash
|
||||
make repo-consistency
|
||||
```
|
||||
|
||||
Dies ├╝berpr├╝ft, ob:
|
||||
|
||||
- Alle zum Init hinzugef├╝gten Objekte sind dokumentiert (ausgef├╝hrt von `utils/check_repo.py`)
|
||||
- Alle `__init__.py`-Dateien haben in ihren beiden Abschnitten den gleichen Inhalt (ausgef├╝hrt von `utils/check_inits.py`)
|
||||
- Der gesamte Code, der als Kopie eines anderen Moduls identifiziert wurde, stimmt mit dem Original ├╝berein (ausgef├╝hrt von `utils/check_copies.py`)
|
||||
- Alle Konfigurationsklassen haben mindestens einen g├╝ltigen Pr├╝fpunkt, der in ihren Dokumentationen erw├дhnt wird (ausgef├╝hrt von `utils/check_config_docstrings.py`)
|
||||
- Alle Konfigurationsklassen enthalten nur Attribute, die in den entsprechenden Modellierungsdateien verwendet werden (ausgef├╝hrt von `utils/check_config_attributes.py`)
|
||||
- Die ├Ьbersetzungen der READMEs und der Index des Dokuments haben die gleiche Modellliste wie die Haupt-README (durchgef├╝hrt von `utils/check_copies.py`)
|
||||
- Die automatisch generierten Tabellen in der Dokumentation sind auf dem neuesten Stand (ausgef├╝hrt von `utils/check_table.py`)
|
||||
- Die Bibliothek verf├╝gt ├╝ber alle Objekte, auch wenn nicht alle optionalen Abh├дngigkeiten installiert sind (ausgef├╝hrt von `utils/check_dummies.py`)
|
||||
|
||||
Sollte diese Prüfung fehlschlagen, müssen die ersten beiden Punkte manuell korrigiert werden; die letzten vier können automatisch für Sie korrigiert werden, indem Sie den folgenden Befehl ausführen:
|
||||
|
||||
```bash
|
||||
make fix-copies
|
||||
```
|
||||
|
||||
Zus├дtzliche Pr├╝fungen betreffen PRs, die neue Modelle hinzuf├╝gen, vor allem, dass:
|
||||
|
||||
- Alle hinzugef├╝gten Modelle befinden sich in einer Auto-Zuordnung (durchgef├╝hrt von `utils/check_repo.py`)
|
||||
<!-- TODO Sylvain, add a check that makes sure the common tests are implemented.-->
|
||||
- Alle Modelle werden ordnungsgem├д├Я getestet (ausgef├╝hrt von `utils/check_repo.py`)
|
||||
|
||||
<!-- TODO Sylvain, add the following
|
||||
- All models are added to the main README, inside the main doc
|
||||
- All checkpoints used actually exist on the Hub
|
||||
|
||||
-->
|
||||
|
||||
### Kopien prüfen
|
||||
|
||||
Da die Transformers-Bibliothek in Bezug auf den Modellcode sehr eigenwillig ist und jedes Modell vollst├дndig in einer einzigen Datei implementiert sein sollte, ohne sich auf andere Modelle zu st├╝tzen, haben wir einen Mechanismus hinzugef├╝gt, der ├╝berpr├╝ft, ob eine Kopie des Codes einer Ebene eines bestimmten Modells mit dem Original ├╝bereinstimmt. Auf diese Weise k├╢nnen wir bei einer Fehlerbehebung alle anderen betroffenen Modelle sehen und entscheiden, ob wir die ├Дnderung weitergeben oder die Kopie zerst├╢ren.
|
||||
|
||||
<Tip>
|
||||
|
||||
Wenn eine Datei eine vollst├дndige Kopie einer anderen Datei ist, sollten Sie sie in der Konstante `FULL_COPIES` von `utils/check_copies.py` registrieren.
|
||||
|
||||
</Tip>
|
||||
|
||||
Dieser Mechanismus stützt sich auf Kommentare der Form `# Copied from xxx`. Das `xxx` sollte den gesamten Pfad zu der Klasse oder Funktion enthalten, die darunter kopiert wird. Zum Beispiel ist `RobertaSelfOutput` eine direkte Kopie der Klasse `BertSelfOutput`. Sie können also [hier](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L289) sehen, dass sie einen Kommentar hat:
|
||||
|
||||
```py
|
||||
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
|
||||
```
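
Im Quellcode sieht das (hier stark verkürzt und rein illustrativ wiedergegeben) etwa so aus; der Kommentar steht direkt über der kopierten Klasse:

```py
from torch import nn


# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
class RobertaSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # ... (forward-Methode hier ausgelassen)
```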
|
||||
|
||||
Beachten Sie, dass Sie dies nicht auf eine ganze Klasse anwenden, sondern auf die entsprechenden Methoden, von denen kopiert wird. Zum Beispiel [hier](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L598) k├╢nnen Sie sehen, wie `RobertaPreTrainedModel._init_weights` von der gleichen Methode in `BertPreTrainedModel` mit dem Kommentar kopiert wird:
|
||||
|
||||
```py
|
||||
# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
|
||||
```
|
||||
|
||||
Manchmal ist die Kopie bis auf die Namen genau gleich: zum Beispiel verwenden wir in `RobertaAttention` `RobertaSelfAttention` anstelle von `BertSelfAttention`, aber ansonsten ist der Code genau derselbe. Aus diesem Grund unterstützt `# Copied from` einfache String-Ersetzungen mit der folgenden Syntax: `Copied from xxx with foo->bar`. Das bedeutet, dass der Code kopiert wird, wobei alle Instanzen von `foo` durch `bar` ersetzt werden. Sie können [hier](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L304C1-L304C86) in `RobertaAttention` sehen, wie es mit dem Kommentar verwendet wird:
|
||||
|
||||
```py
|
||||
# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta
|
||||
```
|
||||
|
||||
Beachten Sie, dass um den Pfeil herum keine Leerzeichen stehen sollten (es sei denn, das Leerzeichen ist Teil des zu ersetzenden Musters, nat├╝rlich).
|
||||
|
||||
Sie können mehrere Muster durch ein Komma getrennt hinzufügen. Zum Beispiel ist hier `CamembertForMaskedLM` eine direkte Kopie von `RobertaForMaskedLM` mit zwei Ersetzungen: `Roberta` zu `Camembert` und `ROBERTA` zu `CAMEMBERT`. Sie können [hier](https://github.com/huggingface/transformers/blob/15082a9dc6950ecae63a0d3e5060b2fc7f15050a/src/transformers/models/camembert/modeling_camembert.py#L929) sehen, wie dies mit dem Kommentar gemacht wird:
|
||||
|
||||
```py
|
||||
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT
|
||||
```
|
||||
|
||||
Wenn die Reihenfolge eine Rolle spielt (weil eine der Ersetzungen mit einer vorherigen in Konflikt geraten k├╢nnte), werden die Ersetzungen von links nach rechts ausgef├╝hrt.
|
||||
|
||||
<Tip>
|
||||
|
||||
Wenn die Ersetzungen die Formatierung ├дndern (wenn Sie z.B. einen kurzen Namen durch einen sehr langen Namen ersetzen), wird die Kopie nach Anwendung des automatischen Formats ├╝berpr├╝ft.
|
||||
|
||||
</Tip>
|
||||
|
||||
Eine andere M├╢glichkeit, wenn es sich bei den Mustern nur um verschiedene Umschreibungen derselben Ersetzung handelt (mit einer gro├Я- und einer kleingeschriebenen Variante), besteht darin, die Option `all-casing` hinzuzuf├╝gen. [Hier](https://github.com/huggingface/transformers/blob/15082a9dc6950ecae63a0d3e5060b2fc7f15050a/src/transformers/models/mobilebert/modeling_mobilebert.py#L1237) ist ein Beispiel in `MobileBertForSequenceClassification` mit dem Kommentar:
|
||||
|
||||
```py
|
||||
# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with Bert->MobileBert all-casing
|
||||
```
|
||||
|
||||
In diesem Fall wird der Code von `BertForSequenceClassification` kopiert, indem er ersetzt wird:
|
||||
- `Bert` durch `MobileBert` (zum Beispiel bei der Verwendung von `MobileBertModel` in der Init)
|
||||
- `bert` durch `mobilebert` (zum Beispiel bei der Definition von `self.mobilebert`)
|
||||
- `BERT` durch `MOBILEBERT` (in der Konstante `MOBILEBERT_INPUTS_DOCSTRING`)
|
||||
docs/source/de/run_scripts.md (Normal file, 351 lines)
@@ -0,0 +1,351 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
тЪая╕П Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Trainieren mit einem Skript
|
||||
|
||||
Neben den 🤗 Transformers [notebooks](./notebooks/README) gibt es auch Beispielskripte, die zeigen, wie man ein Modell für eine Aufgabe mit [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) oder [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax) trainiert.
|
||||
|
||||
Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers/tree/main/examples/research_projects) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die gr├╢├Яtenteils von der Community stammen. Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von ЁЯдЧ Transformers, die h├╢chstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist.
|
||||
|
||||
Es wird nicht erwartet, dass die Beispielskripte bei jedem Problem sofort funktionieren. M├╢glicherweise m├╝ssen Sie das Skript an das Problem anpassen, das Sie zu l├╢sen versuchen. Um Ihnen dabei zu helfen, legen die meisten Skripte vollst├дndig offen, wie die Daten vorverarbeitet werden, so dass Sie sie nach Bedarf f├╝r Ihren Anwendungsfall bearbeiten k├╢nnen.
|
||||
|
||||
F├╝r jede Funktion, die Sie in einem Beispielskript implementieren m├╢chten, diskutieren Sie bitte im [Forum] (https://discuss.huggingface.co/) oder in einem [issue] (https://github.com/huggingface/transformers/issues), bevor Sie einen Pull Request einreichen. Wir freuen uns zwar ├╝ber Fehlerkorrekturen, aber es ist unwahrscheinlich, dass wir einen Pull Request zusammenf├╝hren, der mehr Funktionalit├дt auf Kosten der Lesbarkeit hinzuf├╝gt.
|
||||
|
||||
Diese Anleitung zeigt Ihnen, wie Sie ein Beispiel f├╝r ein Trainingsskript zur Zusammenfassung in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) und [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) ausf├╝hren k├╢nnen. Sofern nicht anders angegeben, sollten alle Beispiele mit beiden Frameworks funktionieren.
|
||||
|
||||
## Einrichtung
|
||||
|
||||
Um die neueste Version der Beispielskripte erfolgreich auszuf├╝hren, **m├╝ssen Sie ЁЯдЧ Transformers aus dem Quellcode** in einer neuen virtuellen Umgebung installieren:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/transformers
|
||||
cd transformers
|
||||
pip install .
|
||||
```
|
||||
|
||||
Für ältere Versionen der Beispielskripte klappen Sie den folgenden Abschnitt auf:
|
||||
|
||||
<details>
|
||||
<summary>Beispiele f├╝r ├дltere Versionen von ЁЯдЧ Transformers</summary>
|
||||
<ul>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v4.5.1/examples">v4.5.1</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v4.4.2/examples">v4.4.2</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v4.3.3/examples">v4.3.3</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v4.2.2/examples">v4.2.2</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v4.1.1/examples">v4.1.1</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v4.0.1/examples">v4.0.1</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v3.5.1/examples">v3.5.1</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v3.4.0/examples">v3.4.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v3.3.1/examples">v3.3.1</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v3.2.0/examples">v3.2.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v3.1.0/examples">v3.1.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v3.0.2/examples">v3.0.2</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.11.0/examples">v2.11.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.10.0/examples">v2.10.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.9.1/examples">v2.9.1</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.8.0/examples">v2.8.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.7.0/examples">v2.7.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.6.0/examples">v2.6.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.5.1/examples">v2.5.1</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.4.0/examples">v2.4.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.3.0/examples">v2.3.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.2.0/examples">v2.2.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.1.0/examples">v2.1.1</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v2.0.0/examples">v2.0.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v1.2.0/examples">v1.2.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v1.1.0/examples">v1.1.0</a></li>
|
||||
<li><a href="https://github.com/huggingface/transformers/tree/v1.0.0/examples">v1.0.0</a></li>
|
||||
</ul>
|
||||
</details>
|
||||
|
||||
Dann stellen Sie Ihren aktuellen Klon von ЁЯдЧ Transformers auf eine bestimmte Version um, z.B. v3.5.1:
|
||||
|
||||
```bash
|
||||
git checkout tags/v3.5.1
|
||||
```
|
||||
|
||||
Nachdem Sie die richtige Bibliotheksversion eingerichtet haben, navigieren Sie zu dem Beispielordner Ihrer Wahl und installieren die beispielspezifischen Anforderungen:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Ein Skript ausführen
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
Das Beispielskript l├дdt einen Datensatz aus der ЁЯдЧ [Datasets](https://huggingface.co/docs/datasets/) Bibliothek herunter und verarbeitet ihn vor. Dann nimmt das Skript eine Feinabstimmung eines Datensatzes mit dem [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) auf einer Architektur vor, die eine Zusammenfassung unterst├╝tzt. Das folgende Beispiel zeigt, wie die Feinabstimmung von [T5-small](https://huggingface.co/t5-small) auf dem Datensatz [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) durchgef├╝hrt wird. Das T5-Modell ben├╢tigt aufgrund der Art und Weise, wie es trainiert wurde, ein zus├дtzliches Argument `source_prefix`. Mit dieser Eingabeaufforderung wei├Я T5, dass es sich um eine Zusammenfassungsaufgabe handelt.
|
||||
|
||||
```bash
|
||||
python examples/pytorch/summarization/run_summarization.py \
|
||||
--model_name_or_path t5-small \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--dataset_name cnn_dailymail \
|
||||
--dataset_config "3.0.0" \
|
||||
--source_prefix "summarize: " \
|
||||
--output_dir /tmp/tst-summarization \
|
||||
--per_device_train_batch_size=4 \
|
||||
--per_device_eval_batch_size=4 \
|
||||
--overwrite_output_dir \
|
||||
--predict_with_generate
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
Das Beispielskript l├дdt einen Datensatz aus der ЁЯдЧ [Datasets](https://huggingface.co/docs/datasets/) Bibliothek herunter und verarbeitet ihn vor. Anschlie├Яend nimmt das Skript die Feinabstimmung eines Datensatzes mit Keras auf einer Architektur vor, die die Zusammenfassung unterst├╝tzt. Das folgende Beispiel zeigt, wie die Feinabstimmung von [T5-small](https://huggingface.co/t5-small) auf dem [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) Datensatz durchgef├╝hrt wird. Das T5-Modell ben├╢tigt aufgrund der Art und Weise, wie es trainiert wurde, ein zus├дtzliches Argument `source_prefix`. Mit dieser Eingabeaufforderung wei├Я T5, dass es sich um eine Zusammenfassungsaufgabe handelt.
|
||||
|
||||
```bash
|
||||
python examples/tensorflow/summarization/run_summarization.py \
|
||||
--model_name_or_path t5-small \
|
||||
--dataset_name cnn_dailymail \
|
||||
--dataset_config "3.0.0" \
|
||||
--output_dir /tmp/tst-summarization \
|
||||
--per_device_train_batch_size 8 \
|
||||
--per_device_eval_batch_size 16 \
|
||||
--num_train_epochs 3 \
|
||||
--do_train \
|
||||
--do_eval
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## Verteiltes Training und gemischte Präzision
|
||||
|
||||
Der [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) unterst├╝tzt verteiltes Training und gemischte Pr├дzision, d.h. Sie k├╢nnen ihn auch in einem Skript verwenden. So aktivieren Sie diese beiden Funktionen:
|
||||
|
||||
- F├╝gen Sie das Argument `fp16` hinzu, um gemischte Genauigkeit zu aktivieren.
|
||||
- Legen Sie die Anzahl der zu verwendenden GPUs mit dem Argument `nproc_per_node` fest.
|
||||
|
||||
```bash
|
||||
python -m torch.distributed.launch \
|
||||
--nproc_per_node 8 pytorch/summarization/run_summarization.py \
|
||||
--fp16 \
|
||||
--model_name_or_path t5-small \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--dataset_name cnn_dailymail \
|
||||
--dataset_config "3.0.0" \
|
||||
--source_prefix "summarize: " \
|
||||
--output_dir /tmp/tst-summarization \
|
||||
--per_device_train_batch_size=4 \
|
||||
--per_device_eval_batch_size=4 \
|
||||
--overwrite_output_dir \
|
||||
--predict_with_generate
|
||||
```
|
||||
|
||||
TensorFlow-Skripte verwenden eine [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) f├╝r verteiltes Training, und Sie m├╝ssen dem Trainingsskript keine zus├дtzlichen Argumente hinzuf├╝gen. Das TensorFlow-Skript verwendet standardm├д├Яig mehrere GPUs, wenn diese verf├╝gbar sind.
|
||||
|
||||
## Ein Skript auf einer TPU ausführen
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
Tensor Processing Units (TPUs) sind speziell f├╝r die Beschleunigung der Leistung konzipiert. PyTorch unterst├╝tzt TPUs mit dem [XLA](https://www.tensorflow.org/xla) Deep Learning Compiler (siehe [hier](https://github.com/pytorch/xla/blob/master/README.md) f├╝r weitere Details). Um eine TPU zu verwenden, starten Sie das Skript `xla_spawn.py` und verwenden das Argument `num_cores`, um die Anzahl der TPU-Kerne festzulegen, die Sie verwenden m├╢chten.
|
||||
|
||||
```bash
|
||||
python xla_spawn.py --num_cores 8 \
|
||||
summarization/run_summarization.py \
|
||||
--model_name_or_path t5-small \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--dataset_name cnn_dailymail \
|
||||
--dataset_config "3.0.0" \
|
||||
--source_prefix "summarize: " \
|
||||
--output_dir /tmp/tst-summarization \
|
||||
--per_device_train_batch_size=4 \
|
||||
--per_device_eval_batch_size=4 \
|
||||
--overwrite_output_dir \
|
||||
--predict_with_generate
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
Tensor Processing Units (TPUs) sind speziell f├╝r die Beschleunigung der Leistung konzipiert. TensorFlow Skripte verwenden eine [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) f├╝r das Training auf TPUs. Um eine TPU zu verwenden, ├╝bergeben Sie den Namen der TPU-Ressource an das Argument `tpu`.
|
||||
|
||||
```bash
|
||||
python run_summarization.py \
|
||||
--tpu name_of_tpu_resource \
|
||||
--model_name_or_path t5-small \
|
||||
--dataset_name cnn_dailymail \
|
||||
--dataset_config "3.0.0" \
|
||||
--output_dir /tmp/tst-summarization \
|
||||
--per_device_train_batch_size 8 \
|
||||
--per_device_eval_batch_size 16 \
|
||||
--num_train_epochs 3 \
|
||||
--do_train \
|
||||
--do_eval
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## Führen Sie ein Skript mit 🤗 Accelerate aus
|
||||
|
||||
ЁЯдЧ [Accelerate](https://huggingface.co/docs/accelerate) ist eine reine PyTorch-Bibliothek, die eine einheitliche Methode f├╝r das Training eines Modells auf verschiedenen Arten von Setups (nur CPU, mehrere GPUs, TPUs) bietet und dabei die vollst├дndige Transparenz der PyTorch-Trainingsschleife beibeh├дlt. Stellen Sie sicher, dass Sie ЁЯдЧ Accelerate installiert haben, wenn Sie es nicht bereits haben:
|
||||
|
||||
> Hinweis: Da Accelerate schnell weiterentwickelt wird, muss die Git-Version von Accelerate installiert sein, um die Skripte auszuf├╝hren.
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/accelerate
|
||||
```
|
||||
|
||||
Anstelle des Skripts `run_summarization.py` m├╝ssen Sie das Skript `run_summarization_no_trainer.py` verwenden. Die von Accelerate unterst├╝tzten Skripte haben eine Datei `task_no_trainer.py` im Ordner. Beginnen Sie mit dem folgenden Befehl, um eine Konfigurationsdatei zu erstellen und zu speichern:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
Testen Sie Ihre Einrichtung, um sicherzustellen, dass sie korrekt konfiguriert ist:
|
||||
|
||||
```bash
|
||||
accelerate test
|
||||
```
|
||||
|
||||
Jetzt sind Sie bereit, das Training zu starten:
|
||||
|
||||
```bash
|
||||
accelerate launch run_summarization_no_trainer.py \
|
||||
--model_name_or_path t5-small \
|
||||
--dataset_name cnn_dailymail \
|
||||
--dataset_config "3.0.0" \
|
||||
--source_prefix "summarize: " \
|
||||
--output_dir ~/tmp/tst-summarization
|
||||
```
|
||||
|
||||
## Verwenden Sie einen benutzerdefinierten Datensatz
|
||||
|
||||
Das Zusammenfassungsskript unterstützt benutzerdefinierte Datensätze, solange es sich um eine CSV- oder JSON-Lines-Datei handelt. Wenn Sie Ihren eigenen Datensatz verwenden, müssen Sie mehrere zusätzliche Argumente angeben:
|
||||
|
||||
- `train_file` und `validation_file` geben den Pfad zu Ihren Trainings- und Validierungsdateien an.
|
||||
- `text_column` ist der Eingabetext, der zusammengefasst werden soll.
- `summary_column` ist der auszugebende Zieltext.
|
||||
|
||||
Ein Zusammenfassungsskript, das einen benutzerdefinierten Datensatz verwendet, w├╝rde wie folgt aussehen:
|
||||
|
||||
```bash
|
||||
python examples/pytorch/summarization/run_summarization.py \
|
||||
--model_name_or_path t5-small \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--train_file path_to_csv_or_jsonlines_file \
|
||||
--validation_file path_to_csv_or_jsonlines_file \
|
||||
--text_column text_column_name \
|
||||
--summary_column summary_column_name \
|
||||
--source_prefix "summarize: " \
|
||||
--output_dir /tmp/tst-summarization \
|
||||
--overwrite_output_dir \
|
||||
--per_device_train_batch_size=4 \
|
||||
--per_device_eval_batch_size=4 \
|
||||
--predict_with_generate
|
||||
```
|
||||
|
||||
## Testen Sie ein Skript
|
||||
|
||||
Es ist oft eine gute Idee, Ihr Skript zunächst mit einer kleineren Anzahl von Datensatzbeispielen auszuführen, um sicherzustellen, dass alles wie erwartet funktioniert, bevor Sie sich auf einen ganzen Datensatz festlegen, dessen Verarbeitung Stunden dauern kann. Verwenden Sie die folgenden Argumente, um den Datensatz auf eine maximale Anzahl von Stichproben zu beschränken:
|
||||
|
||||
- `max_train_samples`
|
||||
- `max_eval_samples`
|
||||
- `max_predict_samples`
|
||||
|
||||
```bash
|
||||
python examples/pytorch/summarization/run_summarization.py \
|
||||
--model_name_or_path t5-small \
|
||||
--max_train_samples 50 \
|
||||
--max_eval_samples 50 \
|
||||
--max_predict_samples 50 \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--dataset_name cnn_dailymail \
|
||||
--dataset_config "3.0.0" \
|
||||
--source_prefix "summarize: " \
|
||||
--output_dir /tmp/tst-summarization \
|
||||
--per_device_train_batch_size=4 \
|
||||
--per_device_eval_batch_size=4 \
|
||||
--overwrite_output_dir \
|
||||
--predict_with_generate
|
||||
```
|
||||
|
||||
Nicht alle Beispielskripte unterst├╝tzen das Argument `max_predict_samples`. Wenn Sie sich nicht sicher sind, ob Ihr Skript dieses Argument unterst├╝tzt, f├╝gen Sie das Argument `-h` hinzu, um dies zu ├╝berpr├╝fen:
|
||||
|
||||
```bash
|
||||
examples/pytorch/summarization/run_summarization.py -h
|
||||
```
|
||||
|
||||
## Training vom Kontrollpunkt fortsetzen
|
||||
|
||||
Eine weitere hilfreiche Option, die Sie aktivieren k├╢nnen, ist die Wiederaufnahme des Trainings von einem fr├╝heren Kontrollpunkt aus. Auf diese Weise k├╢nnen Sie im Falle einer Unterbrechung Ihres Trainings dort weitermachen, wo Sie aufgeh├╢rt haben, ohne von vorne beginnen zu m├╝ssen. Es gibt zwei Methoden, um das Training von einem Kontrollpunkt aus wieder aufzunehmen.
|
||||
|
||||
Die erste Methode verwendet das Argument `output_dir previous_output_dir`, um das Training ab dem letzten in `output_dir` gespeicherten Kontrollpunkt wieder aufzunehmen. In diesem Fall sollten Sie `overwrite_output_dir` entfernen:
|
||||
|
||||
```bash
|
||||
python examples/pytorch/summarization/run_summarization.py \
|
||||
--model_name_or_path t5-small \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--dataset_name cnn_dailymail \
|
||||
--dataset_config "3.0.0" \
|
||||
--source_prefix "summarize: " \
|
||||
--output_dir /tmp/tst-summarization \
|
||||
--per_device_train_batch_size=4 \
|
||||
--per_device_eval_batch_size=4 \
|
||||
--output_dir previous_output_dir \
|
||||
--predict_with_generate
|
||||
```
|
||||
|
||||
Die zweite Methode verwendet das Argument `resume_from_checkpoint path_to_specific_checkpoint`, um das Training ab einem bestimmten Checkpoint-Ordner wieder aufzunehmen.
|
||||
|
||||
```bash
|
||||
python examples/pytorch/summarization/run_summarization.py \
|
||||
--model_name_or_path t5-small \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--dataset_name cnn_dailymail \
|
||||
--dataset_config "3.0.0" \
|
||||
--source_prefix "summarize: " \
|
||||
--output_dir /tmp/tst-summarization \
|
||||
--per_device_train_batch_size=4 \
|
||||
--per_device_eval_batch_size=4 \
|
||||
--overwrite_output_dir \
|
||||
--resume_from_checkpoint path_to_specific_checkpoint \
|
||||
--predict_with_generate
|
||||
```
|
||||
|
||||
## Teilen Sie Ihr Modell
|
||||
|
||||
Alle Skripte k├╢nnen Ihr endg├╝ltiges Modell in den [Model Hub](https://huggingface.co/models) hochladen. Stellen Sie sicher, dass Sie bei Hugging Face angemeldet sind, bevor Sie beginnen:
|
||||
|
||||
```bash
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
Dann f├╝gen Sie dem Skript das Argument `push_to_hub` hinzu. Mit diesem Argument wird ein Repository mit Ihrem Hugging Face-Benutzernamen und dem in `output_dir` angegebenen Ordnernamen erstellt.
|
||||
|
||||
Wenn Sie Ihrem Repository einen bestimmten Namen geben m├╢chten, f├╝gen Sie ihn mit dem Argument `push_to_hub_model_id` hinzu. Das Repository wird automatisch unter Ihrem Namensraum aufgef├╝hrt.
|
||||
|
||||
Das folgende Beispiel zeigt, wie Sie ein Modell mit einem bestimmten Repository-Namen hochladen k├╢nnen:
|
||||
|
||||
```bash
|
||||
python examples/pytorch/summarization/run_summarization.py \
|
||||
--model_name_or_path t5-small \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--dataset_name cnn_dailymail \
|
||||
--dataset_config "3.0.0" \
|
||||
--source_prefix "summarize: " \
|
||||
--push_to_hub \
|
||||
--push_to_hub_model_id finetuned-t5-cnn_dailymail \
|
||||
--output_dir /tmp/tst-summarization \
|
||||
--per_device_train_batch_size=4 \
|
||||
--per_device_eval_batch_size=4 \
|
||||
--overwrite_output_dir \
|
||||
--predict_with_generate
|
||||
```
|
||||
docs/source/de/testing.md (Normal file, 1293 lines; file diff suppressed because it is too large)
docs/source/de/transformers_agents.md (Normal file, 323 lines)
@@ -0,0 +1,323 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
тЪая╕П Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Transformers Agents
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Transformers Agents ist eine experimentelle API, die sich jederzeit ändern kann. Die von den Agenten zurückgegebenen Ergebnisse können variieren, da sich die APIs oder die zugrunde liegenden Modelle ändern können.
|
||||
|
||||
</Tip>
|
||||
|
||||
Transformers Agents wurde in Version v4.29.0 eingeführt und baut auf dem Konzept von *Tools* und *Agenten* auf. Sie können es in [diesem Colab](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj) ausprobieren.
|
||||
|
||||
Kurz gesagt, es bietet eine API f├╝r nat├╝rliche Sprache auf der Grundlage von Transformers: Wir definieren eine Reihe von kuratierten Tools und entwerfen einen
|
||||
Agenten, um nat├╝rliche Sprache zu interpretieren und diese Werkzeuge zu verwenden. Es ist von vornherein erweiterbar; wir haben einige relevante Tools kuratiert,
|
||||
aber wir werden Ihnen zeigen, wie das System einfach erweitert werden kann, um jedes von der Community entwickelte Tool zu verwenden.
|
||||
|
||||
Beginnen wir mit einigen Beispielen dafür, was mit dieser neuen API erreicht werden kann. Sie ist besonders leistungsfähig, wenn es um multimodale Aufgaben geht. Lassen Sie uns also eine Runde drehen, um Bilder zu erzeugen und Text vorzulesen.
|
||||
|
||||
```py
|
||||
agent.run("Caption the following image", image=image)
|
||||
```
|
||||
|
||||
| **Input** | **Output** |
|
||||
|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------|
|
||||
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/beaver.png" width=200> | A beaver is swimming in the water |
|
||||
|
||||
---
|
||||
|
||||
```py
|
||||
agent.run("Read the following text out loud", text=text)
|
||||
```
|
||||
| **Input** | **Output** |
|
||||
|-------------------------------------------------------------------------------------------------------------------------|----------------------------------------------|
|
||||
| A beaver is swimming in the water | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tts_example.wav" type="audio/wav"> your browser does not support the audio element. </audio>
|
||||
|
||||
---
|
||||
|
||||
```py
|
||||
agent.run(
|
||||
"In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?",
|
||||
document=document,
|
||||
)
|
||||
```
|
||||
| **Input** | **Output** |
|
||||
|-----------------------------------------------------------------------------------------------------------------------------|----------------|
|
||||
| <img src="https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/0/image/image.jpg" width=200> | ballroom foyer |
|
||||
|
||||
## Schnellstart
|
||||
|
||||
Bevor Sie `agent.run` verwenden k├╢nnen, m├╝ssen Sie einen Agenten instanziieren, der ein gro├Яes Sprachmodell (LLM) ist.
|
||||
Wir bieten Unterst├╝tzung f├╝r openAI-Modelle sowie f├╝r OpenSource-Alternativen von BigCode und OpenAssistant. Die openAI
|
||||
Modelle sind leistungsf├дhiger (erfordern aber einen openAI-API-Schl├╝ssel, k├╢nnen also nicht kostenlos verwendet werden); Hugging Face
|
||||
bietet kostenlosen Zugang zu Endpunkten f├╝r BigCode- und OpenAssistant-Modelle.
|
||||
|
||||
Installieren Sie zunächst die `agents`-Extras, um alle Standardabhängigkeiten zu installieren:
|
||||
```bash
|
||||
pip install transformers[agents]
|
||||
```
|
||||
|
||||
Um openAI-Modelle zu verwenden, instanziieren Sie einen [`OpenAiAgent`], nachdem Sie die `openai`-Abh├дngigkeit installiert haben:
|
||||
|
||||
```bash
|
||||
pip install openai
|
||||
```
|
||||
|
||||
|
||||
```py
|
||||
from transformers import OpenAiAgent
|
||||
|
||||
agent = OpenAiAgent(model="text-davinci-003", api_key="<your_api_key>")
|
||||
```
|
||||
|
||||
Um BigCode oder OpenAssistant zu verwenden, melden Sie sich zun├дchst an, um Zugriff auf die Inference API zu erhalten:
|
||||
|
||||
```py
|
||||
from huggingface_hub import login
|
||||
|
||||
login("<YOUR_TOKEN>")
|
||||
```
|
||||
|
||||
Dann instanziieren Sie den Agenten
|
||||
|
||||
```py
|
||||
from transformers import HfAgent
|
||||
|
||||
# Starcoder
|
||||
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
|
||||
# StarcoderBase
|
||||
# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase")
|
||||
# OpenAssistant
|
||||
# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5")
|
||||
```
|
||||
|
||||
Dies geschieht mit der Inferenz-API, die Hugging Face derzeit kostenlos zur Verf├╝gung stellt. Wenn Sie Ihren eigenen Inferenz
|
||||
Endpunkt f├╝r dieses Modell (oder einen anderen) haben, k├╢nnen Sie die obige URL durch Ihren URL-Endpunkt ersetzen.
|
||||
|
||||
<Tip>
|
||||
|
||||
StarCoder und OpenAssistant sind kostenlos und leisten bei einfachen Aufgaben bewundernswert gute Arbeit. Allerdings halten die Checkpoints komplexeren Aufforderungen nicht gut stand. Wenn Sie mit einem solchen Problem konfrontiert sind, empfehlen wir Ihnen, das OpenAI-Modell auszuprobieren, das zwar leider nicht quelloffen ist, aber zurzeit eine bessere Leistung erbringt.
|
||||
|
||||
</Tip>
|
||||
|
||||
Sie sind jetzt startklar! Lassen Sie uns in die beiden APIs eintauchen, die Ihnen jetzt zur Verf├╝gung stehen.
|
||||
|
||||
### Einzelne Ausführung (run)
|
||||
|
||||
Die einmalige Ausführung erfolgt über die [`~Agent.run`]-Methode des Agenten:
|
||||
|
||||
```py
|
||||
agent.run("Draw me a picture of rivers and lakes.")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
|
||||
|
||||
Es wählt automatisch das Werkzeug bzw. die Werkzeuge aus, die für die von Ihnen gewünschte Aufgabe geeignet sind, und führt sie entsprechend aus. Es kann eine oder mehrere Aufgaben in derselben Anweisung ausführen (je komplexer Ihre Anweisung ist, desto wahrscheinlicher ist es jedoch, dass der Agent scheitert).
|
||||
|
||||
```py
|
||||
agent.run("Draw me a picture of the sea then transform the picture to add an island")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sea_and_island.png" width=200>
|
||||
|
||||
<br/>
|
||||
|
||||
|
||||
Jede [`~Agent.run`] Operation ist unabh├дngig, so dass Sie sie mehrmals hintereinander mit unterschiedlichen Aufgaben ausf├╝hren k├╢nnen.
|
||||
|
||||
Beachten Sie, dass Ihr `Agent` nur ein großes Sprachmodell ist; kleine Variationen in Ihrer Eingabeaufforderung können daher völlig unterschiedliche Ergebnisse liefern. Es ist wichtig, dass Sie die Aufgabe, die Sie ausführen möchten, so genau wie möglich erklären. Wie man gute Prompts schreibt, erläutern wir ausführlicher [hier](custom_tools#writing-good-user-inputs).
|
||||
|
||||
Wenn Sie einen Zustand über mehrere Ausführungen hinweg beibehalten oder dem Agenten Nicht-Text-Objekte übergeben möchten, können Sie dies tun, indem Sie Variablen angeben, die der Agent verwenden soll. Sie könnten zum Beispiel das erste Bild von Flüssen und Seen erzeugen und das Modell bitten, dieses Bild zu aktualisieren und eine Insel hinzuzufügen, indem Sie Folgendes tun:
|
||||
|
||||
```python
|
||||
picture = agent.run("Generate a picture of rivers and lakes.")
|
||||
updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture)
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Dies kann hilfreich sein, wenn das Modell Ihre Anfrage nicht verstehen kann und die Werkzeuge verwechselt. Ein Beispiel w├дre:
|
||||
|
||||
```py
|
||||
agent.run("Draw me the picture of a capybara swimming in the sea")
|
||||
```
|
||||
|
||||
Hier könnte das Modell die Anfrage auf zwei Arten interpretieren:
- Das Werkzeug `text-to-image` erzeugt ein Wasserschwein, das im Meer schwimmt.
- Oder das Werkzeug `text-to-image` erzeugt ein Wasserschwein, und anschließend wird das Werkzeug `image-transformation` verwendet, um es im Meer schwimmen zu lassen.
|
||||
|
||||
Falls Sie das erste Szenario erzwingen m├╢chten, k├╢nnen Sie dies tun, indem Sie die Eingabeaufforderung als Argument ├╝bergeben:
|
||||
|
||||
```py
|
||||
agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea")
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
### Chat-basierte Ausführung (Chat)
|
||||
|
||||
Der Agent verf├╝gt auch ├╝ber einen Chat-basierten Ansatz, der die Methode [`~Agent.chat`] verwendet:
|
||||
|
||||
```py
|
||||
agent.chat("Generate a picture of rivers and lakes")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
|
||||
|
||||
```py
|
||||
agent.chat("Transform the picture so that there is a rock in there")
|
||||
```
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_and_beaver.png" width=200>
|
||||
|
||||
<br/>
|
||||
|
||||
Dies ist ein interessanter Ansatz, wenn Sie den Zustand ├╝ber Anweisungen hinweg beibehalten m├╢chten. Er ist besser f├╝r Experimente geeignet,
|
||||
eignet sich aber eher f├╝r einzelne Anweisungen als f├╝r komplexe Anweisungen (die die [`~Agent.run`]
|
||||
Methode besser verarbeiten kann).
|
||||
|
||||
Diese Methode kann auch Argumente entgegennehmen, wenn Sie Nicht-Text-Typen oder bestimmte Aufforderungen ├╝bergeben m├╢chten.
|
||||
|
||||
### тЪая╕П Fernausf├╝hrung
|
||||
|
||||
Zu Demonstrationszwecken und damit es mit allen Setups verwendet werden kann, haben wir Remote-Executors f├╝r mehrere
|
||||
der Standard-Tools erstellt, auf die der Agent in dieser Version Zugriff hat. Diese werden erstellt mit
|
||||
[inference endpoints](https://huggingface.co/inference-endpoints).
|
||||
|
||||
Wir haben diese vorerst deaktiviert, aber um zu sehen, wie Sie selbst Remote Executors Tools einrichten k├╢nnen,
|
||||
empfehlen wir die Lekt├╝re des [custom tool guide](./custom_tools).
|
||||
|
||||
### Was passiert hier? Was sind Tools und was sind Agenten?
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/diagram.png">
|
||||
|
||||
#### Agenten
|
||||
|
||||
Der "Agent" ist hier ein gro├Яes Sprachmodell, das wir auffordern, Zugang zu einem bestimmten Satz von Tools zu erhalten.
|
||||
|
||||
LLMs sind ziemlich gut darin, kleine Codeproben zu erzeugen. Diese API macht sich das zunutze, indem sie das
|
||||
LLM ein kleines Codebeispiel gibt, das eine Aufgabe mit einer Reihe von Werkzeugen ausf├╝hrt. Diese Aufforderung wird dann erg├дnzt durch die
|
||||
Aufgabe, die Sie Ihrem Agenten geben, und die Beschreibung der Werkzeuge, die Sie ihm geben. Auf diese Weise erh├дlt er Zugriff auf die Dokumentation der
|
||||
Tools, insbesondere die erwarteten Eingaben und Ausgaben, und kann den entsprechenden Code generieren.
|
||||
|
||||
#### Tools
|
||||
|
||||
Tools sind sehr einfach: Sie bestehen aus einer einzigen Funktion mit einem Namen und einer Beschreibung. Wir verwenden dann die Beschreibungen dieser Tools
|
||||
um den Agenten aufzufordern. Anhand der Eingabeaufforderung zeigen wir dem Agenten, wie er die Tools nutzen kann, um das zu tun, was in der
|
||||
in der Abfrage angefordert wurde.
|
||||
|
||||
Dies geschieht mit brandneuen Tools und nicht mit Pipelines, denn der Agent schreibt besseren Code mit sehr atomaren Tools.
|
||||
Pipelines sind st├дrker refaktorisiert und fassen oft mehrere Aufgaben in einer einzigen zusammen. Tools sind daf├╝r gedacht, sich auf
|
||||
eine einzige, sehr einfache Aufgabe konzentrieren.
|
||||
|
||||
#### Code-Ausf├╝hrung?!
|
||||
|
||||
Dieser Code wird dann mit unserem kleinen Python-Interpreter auf den mit Ihren Tools ├╝bergebenen Eingaben ausgef├╝hrt.
|
||||
Wir h├╢ren Sie schon schreien "Willk├╝rliche Codeausf├╝hrung!", aber lassen Sie uns erkl├дren, warum das nicht der Fall ist.
|
||||
|
||||
Die einzigen Funktionen, die aufgerufen werden k├╢nnen, sind die von Ihnen zur Verf├╝gung gestellten Tools und die Druckfunktion, so dass Sie bereits eingeschr├дnkt sind
|
||||
eingeschr├дnkt, was ausgef├╝hrt werden kann. Sie sollten sicher sein, wenn es sich auf die Werkzeuge f├╝r das Umarmungsgesicht beschr├дnkt.
|
||||
|
||||
Dann lassen wir keine Attributsuche oder Importe zu (die ohnehin nicht ben├╢tigt werden, um die
|
||||
Inputs/Outputs an eine kleine Gruppe von Funktionen), so dass alle offensichtlichen Angriffe (und Sie m├╝ssten den LLM
|
||||
dazu auffordern, sie auszugeben) kein Problem darstellen sollten. Wenn Sie auf Nummer sicher gehen wollen, k├╢nnen Sie die
|
||||
run()-Methode mit dem zus├дtzlichen Argument return_code=True ausf├╝hren. In diesem Fall gibt der Agent nur den auszuf├╝hrenden Code
|
||||
zur Ausf├╝hrung zur├╝ck und Sie k├╢nnen entscheiden, ob Sie ihn ausf├╝hren m├╢chten oder nicht.
|
||||
|
||||
Die Ausf├╝hrung bricht bei jeder Zeile ab, in der versucht wird, eine illegale Operation auszuf├╝hren, oder wenn ein regul├дrer Python-Fehler
|
||||
mit dem vom Agenten generierten Code.
|
||||
|
||||
### Ein kuratierter Satz von Tools
|
||||
|
||||
Wir haben eine Reihe von Tools identifiziert, die solche Agenten unterst├╝tzen k├╢nnen. Hier ist eine aktualisierte Liste der Tools, die wir integriert haben
|
||||
in `transformers` integriert haben:
|
||||
|
||||
- **Beantwortung von Fragen zu Dokumenten**: Beantworten Sie anhand eines Dokuments (z.B. PDF) im Bildformat eine Frage zu diesem Dokument ([Donut](./model_doc/donut))
|
||||
- Beantworten von Textfragen**: Geben Sie einen langen Text und eine Frage an, beantworten Sie die Frage im Text ([Flan-T5](./model_doc/flan-t5))
|
||||
- **Unbedingte Bildunterschriften**: Beschriften Sie das Bild! ([BLIP](./model_doc/blip))
|
||||
- **Bildfragebeantwortung**: Beantworten Sie bei einem Bild eine Frage zu diesem Bild ([VILT](./model_doc/vilt))
|
||||
- **Bildsegmentierung**: Geben Sie ein Bild und einen Prompt an und geben Sie die Segmentierungsmaske dieses Prompts aus ([CLIPSeg](./model_doc/clipseg))
|
||||
- **Sprache in Text**: Geben Sie eine Audioaufnahme einer sprechenden Person an und transkribieren Sie die Sprache in Text ([Whisper](./model_doc/whisper))
|
||||
- **Text in Sprache**: wandelt Text in Sprache um ([SpeechT5](./model_doc/speecht5))
|
||||
- **Zero-Shot-Textklassifizierung**: Ermitteln Sie anhand eines Textes und einer Liste von Bezeichnungen, welcher Bezeichnung der Text am ehesten entspricht ([BART](./model_doc/bart))
|
||||
- **Textzusammenfassung**: fassen Sie einen langen Text in einem oder wenigen S├дtzen zusammen ([BART](./model_doc/bart))
|
||||
- **├Ьbersetzung**: ├Ьbersetzen des Textes in eine bestimmte Sprache ([NLLB](./model_doc/nllb))
|
||||
|
||||
Diese Tools sind in Transformatoren integriert und k├╢nnen auch manuell verwendet werden, zum Beispiel:
|
||||
|
||||
```py
|
||||
from transformers import load_tool
|
||||
|
||||
tool = load_tool("text-to-speech")
|
||||
audio = tool("This is a text to speech tool")
|
||||
```
|
||||
|
||||
### Benutzerdefinierte Tools
|
||||
|
||||
Wir haben zwar eine Reihe von Tools identifiziert, sind aber der festen ├Ьberzeugung, dass der Hauptwert dieser Implementierung darin besteht
|
||||
die M├╢glichkeit, benutzerdefinierte Tools schnell zu erstellen und weiterzugeben.
|
||||
|
||||
Indem Sie den Code eines Tools in einen Hugging Face Space oder ein Modell-Repository stellen, k├╢nnen Sie das Tool
|
||||
direkt mit dem Agenten nutzen. Wir haben ein paar neue Funktionen hinzugef├╝gt
|
||||
**transformers-agnostic** Tools zur [`huggingface-tools` Organisation](https://huggingface.co/huggingface-tools) hinzugef├╝gt:
|
||||
|
||||
- **Text-Downloader**: zum Herunterladen eines Textes von einer Web-URL
|
||||
- **Text zu Bild**: erzeugt ein Bild nach einer Eingabeaufforderung und nutzt dabei stabile Diffusion
|
||||
- **Bildtransformation**: ver├дndert ein Bild anhand eines Ausgangsbildes und einer Eingabeaufforderung, unter Ausnutzung der stabilen pix2pix-Diffusion
|
||||
- **Text zu Video**: Erzeugen eines kleinen Videos nach einer Eingabeaufforderung, unter Verwendung von damo-vilab
|
||||
|
||||
Das Text-zu-Bild-Tool, das wir von Anfang an verwendet haben, ist ein Remote-Tool, das sich in
|
||||
[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)! Wir werden
|
||||
weiterhin solche Tools f├╝r diese und andere Organisationen ver├╢ffentlichen, um diese Implementierung weiter zu verbessern.
|
||||
|
||||
Die Agenten haben standardm├д├Яig Zugriff auf die Tools, die sich auf [*huggingface-tools*](https://huggingface.co/huggingface-tools) befinden.
|
||||
Wie Sie Ihre eigenen Tools schreiben und freigeben k├╢nnen und wie Sie jedes benutzerdefinierte Tool, das sich auf dem Hub befindet, nutzen k├╢nnen, erkl├дren wir in [folgender Anleitung](custom_tools).
|
||||
|
||||
### Code-Erzeugung
|
||||
|
||||
Bisher haben wir gezeigt, wie Sie die Agenten nutzen k├╢nnen, um Aktionen f├╝r Sie durchzuf├╝hren. Der Agent generiert jedoch nur Code
|
||||
den wir dann mit einem sehr eingeschr├дnkten Python-Interpreter ausf├╝hren. Falls Sie den generierten Code in einer anderen Umgebung verwenden m├╢chten
|
||||
einer anderen Umgebung verwenden m├╢chten, k├╢nnen Sie den Agenten auffordern, den Code zusammen mit einer Tooldefinition und genauen Importen zur├╝ckzugeben.
|
||||
|
||||
Zum Beispiel die folgende Anweisung
|
||||
```python
|
||||
agent.run("Draw me a picture of rivers and lakes", return_code=True)
|
||||
```
|
||||
|
||||
gibt den folgenden Code zur├╝ck
|
||||
|
||||
```python
|
||||
from transformers import load_tool
|
||||
|
||||
image_generator = load_tool("huggingface-tools/text-to-image")
|
||||
|
||||
image = image_generator(prompt="rivers and lakes")
|
||||
```
|
||||
|
||||
die Sie dann selbst ├дndern und ausf├╝hren k├╢nnen.
|
||||
3
docs/source/en/_redirects.yml
Normal file
3
docs/source/en/_redirects.yml
Normal file
@ -0,0 +1,3 @@
|
||||
# Optimizing inference
|
||||
|
||||
perf_infer_gpu_many: perf_infer_gpu_one
|
||||
@ -71,6 +71,10 @@
|
||||
title: Zero-shot image classification
|
||||
- local: tasks/monocular_depth_estimation
|
||||
title: Depth estimation
|
||||
- local: tasks/image_to_image
|
||||
title: Image-to-Image
|
||||
- local: tasks/knowledge_distillation_for_image_classification
|
||||
title: Knowledge Distillation for Computer Vision
|
||||
title: Computer Vision
|
||||
- isExpanded: false
|
||||
sections:
|
||||
@ -92,6 +96,8 @@
|
||||
sections:
|
||||
- local: tasks/idefics
|
||||
title: Image tasks with IDEFICS
|
||||
- local: tasks/prompting
|
||||
title: LLM prompting guide
|
||||
title: Prompting
|
||||
title: Task Guides
|
||||
- sections:
|
||||
@ -149,13 +155,9 @@
|
||||
title: Efficient training techniques
|
||||
- sections:
|
||||
- local: perf_infer_cpu
|
||||
title: Inference on CPU
|
||||
title: CPU inference
|
||||
- local: perf_infer_gpu_one
|
||||
title: Inference on one GPU
|
||||
- local: perf_infer_gpu_many
|
||||
title: Inference on many GPUs
|
||||
- local: perf_infer_special
|
||||
title: Inference on Specialized Hardware
|
||||
title: GPU inference
|
||||
title: Optimizing inference
|
||||
- local: big_models
|
||||
title: Instantiating a big model
|
||||
@ -205,6 +207,8 @@
|
||||
title: Pipelines for webserver inference
|
||||
- local: model_memory_anatomy
|
||||
title: Model training anatomy
|
||||
- local: llm_tutorial_optimization
|
||||
title: Getting the most out of LLMs
|
||||
title: Conceptual guides
|
||||
- sections:
|
||||
- sections:
|
||||
@ -334,6 +338,8 @@
|
||||
title: FSMT
|
||||
- local: model_doc/funnel
|
||||
title: Funnel Transformer
|
||||
- local: model_doc/fuyu
|
||||
title: Fuyu
|
||||
- local: model_doc/openai-gpt
|
||||
title: GPT
|
||||
- local: model_doc/gpt_neo
|
||||
@ -384,6 +390,8 @@
|
||||
title: MegatronBERT
|
||||
- local: model_doc/megatron_gpt2
|
||||
title: MegatronGPT2
|
||||
- local: model_doc/mistral
|
||||
title: Mistral
|
||||
- local: model_doc/mluke
|
||||
title: mLUKE
|
||||
- local: model_doc/mobilebert
|
||||
@ -602,6 +610,8 @@
|
||||
title: MusicGen
|
||||
- local: model_doc/pop2piano
|
||||
title: Pop2Piano
|
||||
- local: model_doc/seamless_m4t
|
||||
title: Seamless-M4T
|
||||
- local: model_doc/sew
|
||||
title: SEW
|
||||
- local: model_doc/sew-d
|
||||
@ -669,6 +679,8 @@
|
||||
title: IDEFICS
|
||||
- local: model_doc/instructblip
|
||||
title: InstructBLIP
|
||||
- local: model_doc/kosmos-2
|
||||
title: KOSMOS-2
|
||||
- local: model_doc/layoutlm
|
||||
title: LayoutLM
|
||||
- local: model_doc/layoutlmv2
|
||||
@ -685,10 +697,14 @@
|
||||
title: MatCha
|
||||
- local: model_doc/mgp-str
|
||||
title: MGP-STR
|
||||
- local: model_doc/nougat
|
||||
title: Nougat
|
||||
- local: model_doc/oneformer
|
||||
title: OneFormer
|
||||
- local: model_doc/owlvit
|
||||
title: OWL-ViT
|
||||
- local: model_doc/owlv2
|
||||
title: OWLv2
|
||||
- local: model_doc/perceiver
|
||||
title: Perceiver
|
||||
- local: model_doc/pix2struct
|
||||
|
||||
@ -52,7 +52,7 @@ A good first starting point to better understand the library is to read the [doc
|
||||
|
||||
In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for
|
||||
inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the
|
||||
person that will use your model, but also everybody that will read, try to understand, and possibly tweak your code.
|
||||
person who will use your model, but also everybody who will read, try to understand, and possibly tweak your code.
|
||||
|
||||
With this in mind, let's go a bit deeper into the general library design.
|
||||
|
||||
@ -131,9 +131,9 @@ From experience, we can tell you that the most important things to keep in mind
|
||||
friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and
|
||||
your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code
|
||||
is based on XLM.
|
||||
- It's more of an engineering challenge than a scientific challenge. You should spend more time on creating an
|
||||
efficient debugging environment than trying to understand all theoretical aspects of the model in the paper.
|
||||
- Ask for help, when you're stuck! Models are the core component of ЁЯдЧ Transformers so that we at Hugging Face are more
|
||||
- It's more of an engineering challenge than a scientific challenge. You should spend more time creating an
|
||||
efficient debugging environment rather than trying to understand all theoretical aspects of the model in the paper.
|
||||
- Ask for help, when you're stuck! Models are the core component of ЁЯдЧ Transformers so we at Hugging Face are more
|
||||
than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making
|
||||
progress.
|
||||
|
||||
@ -157,9 +157,9 @@ List:
|
||||
тШР Submitted the pull request<br>
|
||||
тШР (Optional) Added a demo notebook
|
||||
|
||||
To begin with, we usually recommend to start by getting a good theoretical understanding of `BrandNewBert`. However,
|
||||
To begin with, we usually recommend starting by getting a good theoretical understanding of `BrandNewBert`. However,
|
||||
if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive
|
||||
into the `BrandNewBert`'s code-base. This option might suit you better, if your engineering skills are better than
|
||||
into the `BrandNewBert`'s code-base. This option might suit you better if your engineering skills are better than
|
||||
your theoretical skill, if you have trouble understanding `BrandNewBert`'s paper, or if you just enjoy programming
|
||||
much more than reading scientific papers.
|
||||
|
||||
@ -175,7 +175,7 @@ theoretical aspects, but rather focus on the practical ones, namely:
|
||||
encoder-decoder model? Look at the [model_summary](model_summary) if you're not familiar with the differences between those.
|
||||
- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,*
|
||||
summarization?
|
||||
- What is the novel feature of the model making it different from BERT/GPT-2/BART?
|
||||
- What is the novel feature of the model that makes it different from BERT/GPT-2/BART?
|
||||
- Which of the already existing [ЁЯдЧ Transformers models](https://huggingface.co/transformers/#contents) is most
|
||||
similar to *brand_new_bert*?
|
||||
- What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used
|
||||
@ -261,7 +261,7 @@ figure out the following:
|
||||
- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you
|
||||
work with an interactive debugger like *ipdb*, or should you use an efficient IDE to debug the model, like PyCharm?
|
||||
|
||||
It is very important that before you start the porting process, that you can **efficiently** debug code in the original
|
||||
It is very important that before you start the porting process, you can **efficiently** debug code in the original
|
||||
repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or
|
||||
even a pull request in the original repository. The maintainers of this repository are most likely very happy about
|
||||
someone looking into their code!
|
||||
@ -280,10 +280,10 @@ In general, there are two possible debugging environments for running the origin
|
||||
Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split
|
||||
logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also,
|
||||
notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging
|
||||
Face team for help. If you are familiar with Jupyter notebooks, we strongly recommend you to work with them.
|
||||
Face team for help. If you are familiar with Jupyter notebooks, we strongly recommend you work with them.
|
||||
|
||||
The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend
|
||||
some time adjusting to the new programming environment and that you might not be able to use your known debugging tools
|
||||
some time adjusting to the new programming environment and you might not be able to use your known debugging tools
|
||||
anymore, like `ipdb`.
|
||||
|
||||
For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a
|
||||
@ -329,7 +329,7 @@ example is [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/
|
||||
very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one
|
||||
often relies on verifying print statements.
|
||||
|
||||
No matter which strategy you choose, the recommended procedure is often the same in that you should start to debug the
|
||||
No matter which strategy you choose, the recommended procedure is often the same that you should start to debug the
|
||||
starting layers first and the ending layers last.
|
||||
|
||||
It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following
|
||||
@ -364,7 +364,7 @@ depending on the library framework, we accept an error tolerance of 1e-3 (0.001)
|
||||
nearly the same output, they have to be almost identical. Therefore, you will certainly compare the intermediate
|
||||
outputs of the ЁЯдЧ Transformers version multiple times against the intermediate outputs of the original implementation of
|
||||
*brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely
|
||||
important. Here is some advice is to make your debugging environment as efficient as possible.
|
||||
important. Here is some advice to make your debugging environment as efficient as possible.
|
||||
|
||||
- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should
|
||||
probably take the time to write a longer script that decomposes the original model into smaller sub-components to
|
||||
@ -409,7 +409,7 @@ Otherwise, let's start generating a new model. You have two choices here:
|
||||
- `transformers-cli add-new-model-like` to add a new model like an existing one
|
||||
- `transformers-cli add-new-model` to add a new model from our template (will look like BERT or Bart depending on the type of model you select)
|
||||
|
||||
In both cases, you will be prompted with a questionnaire to fill the basic information of your model. The second command requires to install `cookiecutter`, you can find more information on it [here](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model).
|
||||
In both cases, you will be prompted with a questionnaire to fill in the basic information of your model. The second command requires to install `cookiecutter`, you can find more information on it [here](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model).
|
||||
|
||||
**Open a Pull Request on the main huggingface/transformers repo**
|
||||
|
||||
@ -451,7 +451,7 @@ git push -u origin a-descriptive-name-for-my-changes
|
||||
|
||||
6. Change the PR into a draft by clicking on тАЬConvert to draftтАЭ on the right of the GitHub pull request web page.
|
||||
|
||||
In the following, whenever you have done some progress, don't forget to commit your work and push it to your account so
|
||||
In the following, whenever you have made some progress, don't forget to commit your work and push it to your account so
|
||||
that it shows in the pull request. Additionally, you should make sure to update your work with the current main from
|
||||
time to time by doing:
|
||||
|
||||
@ -483,7 +483,7 @@ Now you can finally start coding :). The generated code in
|
||||
`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT if
|
||||
it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what
|
||||
you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or
|
||||
BART?*". Implement those changes which often means to change the *self-attention* layer, the order of the normalization
|
||||
BART?*". Implement those changes which often means changing the *self-attention* layer, the order of the normalization
|
||||
layer, etcтАж Again, it is often useful to look at the similar architecture of already existing models in Transformers to
|
||||
get a better feeling of how your model should be implemented.
|
||||
|
||||
@ -665,7 +665,7 @@ PyTorch's implementation of a layer requires the weight to be transposed beforeh
|
||||
|
||||
Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that
|
||||
were not used for initialization to make sure the model is correctly converted. It is completely normal, that the
|
||||
conversion trials fail with either a wrong shape statement or wrong name assignment. This is most likely because either
|
||||
conversion trials fail with either a wrong shape statement or a wrong name assignment. This is most likely because either
|
||||
you used incorrect parameters in `BrandNewBertConfig()`, have a wrong architecture in the ЁЯдЧ Transformers
|
||||
implementation, you have a bug in the `init()` functions of one of the components of the ЁЯдЧ Transformers
|
||||
implementation or you need to transpose one of the checkpoint weights.
|
||||
@ -722,7 +722,7 @@ in the ЁЯдЧ Transformers implementation. From our experience, a simple and effic
|
||||
in both the original implementation and ЁЯдЧ Transformers implementation, at the same positions in the network
|
||||
respectively, and to successively remove print statements showing the same values for intermediate presentations.
|
||||
|
||||
When you're confident that both implementations yield the same output, verifying the outputs with
|
||||
When you're confident that both implementations yield the same output, verify the outputs with
|
||||
`torch.allclose(original_output, output, atol=1e-3)`, you're done with the most difficult part! Congratulations - the
|
||||
work left to be done should be a cakewalk ЁЯШК.
|
||||
|
||||
@ -744,7 +744,7 @@ Having fixed all common tests, it is now crucial to ensure that all the nice wor
|
||||
- b) Future changes to your model will not break any important feature of the model.
|
||||
|
||||
At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts
|
||||
you used earlier to implement the model to ЁЯдЧ Transformers. A template of those model tests is already added by the
|
||||
you used earlier to implement the model to ЁЯдЧ Transformers. A template of those model tests has already added by the
|
||||
Cookiecutter, called `BrandNewBertModelIntegrationTests` and only has to be filled out by you. To ensure that those
|
||||
tests are passing, run
|
||||
|
||||
@ -769,7 +769,7 @@ ways:
|
||||
|
||||
**9. Implement the tokenizer**
|
||||
|
||||
Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent or very similar to an
|
||||
Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent to or very similar to an
|
||||
already existing tokenizer of ЁЯдЧ Transformers.
|
||||
|
||||
It is very important to find/extract the original tokenizer file and to manage to load this file into the ЁЯдЧ
|
||||
@ -890,6 +890,6 @@ reviewer.
|
||||
Now, it's time to get some credit from the community for your work! Having completed a model addition is a major
|
||||
contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be
|
||||
used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share
|
||||
your achievement with the community.
|
||||
your achievements with the community.
|
||||
|
||||
**You have made another model that is super easy to access for everyone in the community! ЁЯдп**
|
||||
|
||||
@ -94,10 +94,11 @@ default template for that model class is used instead. Let's take a look at the
|
||||
"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"
|
||||
```
|
||||
|
||||
That's kind of intimidating. Let's add some newlines and indentation to make it more readable. Note that
|
||||
we remove the first newline after each block as well as any preceding whitespace before a block by default, using the
|
||||
Jinja `trim_blocks` and `lstrip_blocks` flags. This means that you can write your templates with indentations and
|
||||
newlines and still have them function correctly!
|
||||
That's kind of intimidating. Let's add some newlines and indentation to make it more readable. Note that the first
|
||||
newline after each block as well as any preceding whitespace before a block are ignored by default, using the
|
||||
Jinja `trim_blocks` and `lstrip_blocks` flags. However, be cautious - although leading whitespace on each
|
||||
line is stripped, spaces between blocks on the same line are not. We strongly recommend checking that your template
|
||||
isn't printing extra spaces where it shouldn't be!
|
||||
|
||||
```
|
||||
{% for message in messages %}
|
||||
@ -218,10 +219,11 @@ input formats. Our default template for models that don't have a class-specific
|
||||
{% endfor %}
|
||||
```
|
||||
|
||||
If you like this one, here it is in one-liner form, ready to copy into your code:
|
||||
If you like this one, here it is in one-liner form, ready to copy into your code. The one-liner also includes
|
||||
handy support for "generation prompts" - see the next section for more!
|
||||
|
||||
```
|
||||
tokenizer.chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"
|
||||
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
|
||||
```
|
||||
|
||||
This template wraps each message in `<|im_start|>` and `<|im_end|>` tokens, and simply writes the role as a string, which
|
||||
@ -240,6 +242,56 @@ The "user", "system" and "assistant" roles are the standard for chat, and we rec
|
||||
particularly if you want your model to operate well with [`ConversationalPipeline`]. However, you are not limited
|
||||
to these roles - templating is extremely flexible, and any string can be a role.
|
||||
|
||||
## What are "generation prompts"?
|
||||
|
||||
You may notice that the `apply_chat_template` method has an `add_generation_prompt` argument. This argument tells
|
||||
the template to add tokens that indicate the start of a bot response. For example, consider the following chat:
|
||||
|
||||
```python
|
||||
messages = [
|
||||
{"role": "user", "content": "Hi there!"},
|
||||
{"role": "assistant", "content": "Nice to meet you!"},
|
||||
{"role": "user", "content": "Can I ask a question?"}
|
||||
]
|
||||
```
|
||||
|
||||
Here's what this will look like without a generation prompt, using the ChatML template we described above:
|
||||
|
||||
```python
|
||||
>> tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
|
||||
"""<|im_start|>user
|
||||
Hi there!<|im_end|>
|
||||
<|im_start|>assistant
|
||||
Nice to meet you!<|im_end|>
|
||||
<|im_start|>user
|
||||
Can I ask a question?<|im_end|>
|
||||
"""
|
||||
```
|
||||
|
||||
And here's what it looks like **with** a generation prompt:
|
||||
|
||||
```python
|
||||
>> tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
"""<|im_start|>user
|
||||
Hi there!<|im_end|>
|
||||
<|im_start|>assistant
|
||||
Nice to meet you!<|im_end|>
|
||||
<|im_start|>user
|
||||
Can I ask a question?<|im_end|>
|
||||
<|im_start|>assistant
|
||||
"""
|
||||
```
|
||||
|
||||
Note that this time, we've added the tokens that indicate the start of a bot response. This ensures that when the model
|
||||
generates text it will write a bot response instead of doing something unexpected, like continuing the user's
|
||||
message. Remember, chat models are still just language models - they're trained to continue text, and chat is just a
|
||||
special kind of text to them! You need to guide them with the appropriate control tokens so they know what they're
|
||||
supposed to be doing.
|
||||
|
||||
Not all models require generation prompts. Some models, like BlenderBot and LLaMA, don't have any
|
||||
special tokens before bot responses. In these cases, the `add_generation_prompt` argument will have no effect. The exact
|
||||
effect that `add_generation_prompt` has will depend on the template being used.
|
||||
|
||||
## I want to use chat templates! How should I get started?
|
||||
|
||||
If you have any chat models, you should set their `tokenizer.chat_template` attribute and test it using
|
||||
@ -253,3 +305,63 @@ model, which means it is also automatically supported in places like `Conversati
|
||||
By ensuring that models have this attribute, we can make sure that the whole community gets to use the full power of
|
||||
open-source models. Formatting mismatches have been haunting the field and silently harming performance for too long -
|
||||
it's time to put an end to them!
|
||||
|
||||
## Template writing tips
|
||||
|
||||
If you're unfamiliar with Jinja, we generally find that the easiest way to write a chat template is to first
|
||||
write a short Python script that formats messages the way you want, and then convert that script into a template.
|
||||
|
||||
Remember that the template handler will receive the conversation history as a variable called `messages`. Each
|
||||
message is a dictionary with two keys, `role` and `content`. You will be able to access `messages` in your template
|
||||
just like you can in Python, which means you can loop over it with `{% for message in messages %}` or access
|
||||
individual messages with, for example, `{{ messages[0] }}`.
|
||||
|
||||
You can also use the following tips to convert your code to Jinja:
|
||||
|
||||
### For loops
|
||||
|
||||
For loops in Jinja look like this:
|
||||
|
||||
```
|
||||
{% for message in messages %}
|
||||
{{ message['content'] }}
|
||||
{% endfor %}
|
||||
```
|
||||
|
||||
Note that whatever's inside the {{ expression block }} will be printed to the output. You can use operators like
|
||||
`+` to combine strings inside expression blocks.
|
||||
|
||||
### If statements
|
||||
|
||||
If statements in Jinja look like this:
|
||||
|
||||
```
|
||||
{% if message['role'] == 'user' %}
|
||||
{{ message['content'] }}
|
||||
{% endif %}
|
||||
```
|
||||
|
||||
Note how where Python uses whitespace to mark the beginnings and ends of `for` and `if` blocks, Jinja requires you
|
||||
to explicitly end them with `{% endfor %}` and `{% endif %}`.
|
||||
|
||||
### Special variables
|
||||
|
||||
Inside your template, you will have access to the list of `messages`, but you can also access several other special
|
||||
variables. These include special tokens like `bos_token` and `eos_token`, as well as the `add_generation_prompt`
|
||||
variable that we discussed above. You can also use the `loop` variable to access information about the current loop
|
||||
iteration, for example using `{% if loop.last %}` to check if the current message is the last message in the
|
||||
conversation. Here's an example that puts these ideas together to add a generation prompt at the end of the
|
||||
conversation if add_generation_prompt is `True`:
|
||||
|
||||
```
|
||||
{% if loop.last and add_generation_prompt %}
|
||||
{{ bos_token + 'Assistant:\n' }}
|
||||
{% endif %}
|
||||
```
|
||||
|
||||
### Notes on whitespace
|
||||
|
||||
As much as possible, we've tried to get Jinja to ignore whitespace outside of {{ expressions }}. However, be aware
|
||||
that Jinja is a general-purpose templating engine, and it may treat whitespace between blocks on the same line
|
||||
as significant and print it to the output. We **strongly** recommend checking that your template isn't printing extra
|
||||
spaces where it shouldn't be before you upload it!
|
||||
@ -82,7 +82,8 @@ Even if the default decoding strategy mostly works for your task, you can still
|
||||
commonly adjusted parameters include:
|
||||
|
||||
- `max_new_tokens`: the maximum number of tokens to generate. In other words, the size of the output sequence, not
|
||||
including the tokens in the prompt.
|
||||
including the tokens in the prompt. As an alternative to using the output's length as a stopping criteria, you can choose
|
||||
to stop generation whenever the full generation exceeds some amount of time. To learn more, check [`StoppingCriteria`].
|
||||
- `num_beams`: by specifying a number of beams higher than 1, you are effectively switching from greedy search to
|
||||
beam search. This strategy evaluates several hypotheses at each time step and eventually chooses the hypothesis that
|
||||
has the overall highest probability for the entire sequence. This has the advantage of identifying high-probability
|
||||
|
||||
@ -112,6 +112,12 @@ A type of layer in a neural network where the input matrix is multiplied element
|
||||
|
||||
## D
|
||||
|
||||
### DataParallel (DP)
|
||||
|
||||
Parallelism technique for training on multiple GPUs where the same setup is replicated multiple times, with each instance
|
||||
receiving a distinct data slice. The processing is done in parallel and all setups are synchronized at the end of each training step.
|
||||
Learn more about how DataParallel works [here](perf_train_gpu_many#dataparallel-vs-distributeddataparallel).
|
||||
|
||||
### decoder input IDs
|
||||
|
||||
This input is specific to encoder-decoder models, and contains the input IDs that will be fed to the decoder. These
|
||||
@ -340,6 +346,12 @@ A pipeline in ЁЯдЧ Transformers is an abstraction referring to a series of steps
|
||||
|
||||
For more details, see [Pipelines for inference](https://huggingface.co/docs/transformers/pipeline_tutorial).
|
||||
|
||||
### PipelineParallel (PP)
|
||||
|
||||
Parallelism technique in which the model is split up vertically (layer-level) across multiple GPUs, so that only one or
|
||||
several layers of the model are placed on a single GPU. Each GPU processes in parallel different stages of the pipeline
|
||||
and working on a small chunk of the batch. Learn more about how PipelineParallel works [here](perf_train_gpu_many#from-naive-model-parallelism-to-pipeline-parallelism).
|
||||
|
||||
### pixel values
|
||||
|
||||
A tensor of the numerical representations of an image that is passed to a model. The pixel values have a shape of [`batch_size`, `num_channels`, `height`, `width`], and are generated from an image processor.
|
||||
@ -410,6 +422,10 @@ An example of a semi-supervised learning approach is "self-training", in which a
|
||||
Models that generate a new sequence from an input, like translation models, or summarization models (such as
|
||||
[Bart](model_doc/bart) or [T5](model_doc/t5)).
|
||||
|
||||
### Sharded DDP
|
||||
|
||||
Another name for the foundational [ZeRO](#zero-redundancy-optimizer--zero-) concept as used by various other implementations of ZeRO.
|
||||
|
||||
### stride
|
||||
|
||||
In [convolution](#convolution) or [pooling](#pooling), the stride refers to the distance the kernel is moved over a matrix. A stride of 1 means the kernel is moved one pixel over at a time, and a stride of 2 means the kernel is moved two pixels over at a time.
|
||||
@ -420,6 +436,14 @@ A form of model training that directly uses labeled data to correct and instruct
|
||||
|
||||
## T
|
||||
|
||||
### Tensor Parallelism (TP)
|
||||
|
||||
Parallelism technique for training on multiple GPUs in which each tensor is split up into multiple chunks, so instead of
|
||||
having the whole tensor reside on a single GPU, each shard of the tensor resides on its designated GPU. Shards gets
|
||||
processed separately and in parallel on different GPUs and the results are synced at the end of the processing step.
|
||||
This is what is sometimes called horizontal parallelism, as the splitting happens on horizontal level.
|
||||
Learn more about Tensor Parallelism [here](perf_train_gpu_many#tensor-parallelism).
|
||||
|
||||
### token
|
||||
|
||||
A part of a sentence, usually a word, but can also be a subword (non-common words are often split in subwords) or a
|
||||
@ -489,3 +513,12 @@ Self-attention based deep learning model architecture.
|
||||
### unsupervised learning
|
||||
|
||||
A form of model training in which data provided to the model is not labeled. Unsupervised learning techniques leverage statistical information of the data distribution to find patterns useful for the task at hand.
|
||||
|
||||
## Z
|
||||
|
||||
### Zero Redundancy Optimizer (ZeRO)
|
||||
|
||||
Parallelism technique which performs sharding of the tensors somewhat similar to [TensorParallel](#tensorparallel--tp-),
|
||||
except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need
|
||||
to be modified. This method also supports various offloading techniques to compensate for limited GPU memory.
|
||||
Learn more about ZeRO [here](perf_train_gpu_many#zero-data-parallelism).
|
||||
@ -48,240 +48,8 @@ The documentation is organized into five sections:
|
||||
- **MODELS** details the classes and functions related to each model implemented in the library.
|
||||
- **INTERNAL HELPERS** details utility classes and functions used internally.
|
||||
|
||||
### Supported models
|
||||
|
||||
<!--This list is updated automatically from the README with _make fix-copies_. Do not update manually! -->
|
||||
|
||||
1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||
1. **[ALIGN](model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
|
||||
1. **[AltCLIP](model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
|
||||
1. **[Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
|
||||
1. **[Autoformer](model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
|
||||
1. **[Bark](model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
|
||||
1. **[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
|
||||
1. **[BARThez](model_doc/barthez)** (from ├Йcole polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
|
||||
1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
|
||||
1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
|
||||
1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||
1. **[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
|
||||
1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[BioGpt](model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
|
||||
1. **[BiT](model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
|
||||
1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BLIP](model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
|
||||
1. **[BLIP-2](model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
|
||||
1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
|
||||
1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
|
||||
1. **[BridgeTower](model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
|
||||
1. **[BROS](model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
|
||||
1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
|
||||
1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Su├бrez*, Yoann Dupont, Laurent Romary, ├Йric Villemonte de la Clergerie, Djam├й Seddah and Beno├оt Sagot.
|
||||
1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
||||
1. **[Chinese-CLIP](model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
|
||||
1. **[CLAP](model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
|
||||
1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
||||
1. **[CLIPSeg](model_doc/clipseg)** (from University of G├╢ttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo L├╝ddecke and Alexander Ecker.
|
||||
1. **[CodeGen](model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
|
||||
1. **[CodeLlama](model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozi├иre, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, J├йr├йmy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre D├йfossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
|
||||
1. **[Conditional DETR](model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
|
||||
1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
|
||||
1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
|
||||
1. **[ConvNeXTV2](model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
|
||||
1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
|
||||
1. **[CPM-Ant](model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
|
||||
1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
|
||||
1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
|
||||
1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[Decision Transformer](model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
|
||||
1. **[Deformable DETR](model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
|
||||
1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Herv├й J├йgou.
|
||||
1. **[DePlot](model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
|
||||
1. **[DETA](model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Kr├дhenb├╝hl.
|
||||
1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
|
||||
1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
|
||||
1. **[DiNAT](model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
|
||||
1. **[DINOv2](model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timoth├йe Darcet, Th├йo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Herv├й Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
|
||||
1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
|
||||
1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
1. **[Donut](model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[DPT](model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
1. **[EfficientFormer](model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
1. **[EfficientNet](model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EnCodec](model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[ERNIE](model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
1. **[ErnieM](model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
1. **[ESM](model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
1. **[Falcon](model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
1. **[FLAN-T5](model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
1. **[FLAN-UL2](model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[FocalNet](model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GIT](model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
1. **[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT NeoX](model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
1. **[GPT NeoX Japanese](model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[GPT-Sw3](model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
1. **[GPTBigCode](model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
1. **[GPTSAN-japanese](model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto (tanreinama).
1. **[Graphormer](model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
1. **[GroupViT](model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
1. **[HerBERT](model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[IDEFICS](model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
1. **[Informer](model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
1. **[InstructBLIP](model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
1. **[Jukebox](model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
1. **[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
1. **[LayoutXLM](model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
1. **[LED](model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LeViT](model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
1. **[LiLT](model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
1. **[LLaMA](model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
1. **[Llama2](model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
1. **[Longformer](model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LongT5](model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
1. **[LUKE](model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M-CTC-T](model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MarkupLM](model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
1. **[Mask2Former](model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MatCha](model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
1. **[mBART](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[mBART-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[MEGA](model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[MGP-STR](model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
1. **[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MMS](model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
1. **[MobileBERT](model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
1. **[MobileNetV1](model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
1. **[MobileNetV2](model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
1. **[MobileViT](model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
1. **[MobileViTV2](model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MPT](model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
1. **[MRA](model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[MusicGen](model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
1. **[MVP](model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
1. **[NAT](model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
1. **[Nezha](model_doc/nezha)** (from Huawei Noah's Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
1. **[NLLB](model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
1. **[NLLB-MOE](model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[OneFormer](model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
1. **[OPT](model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PEGASUS-X](model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[Persimmon](model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[Pix2Struct](model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
1. **[Pop2Piano](model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[PVT](model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[RAG](model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
1. **[REALM](model_doc/realm)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
1. **[RoCBert](model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by Hui Su, Weiwei Shi, Xiaoyu Shen, Xiao Zhou, Tuo Ji, Jiarui Fang, Jie Zhou.
1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[RWKV](model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[Segment Anything](model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SpeechT5](model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[SwiftFormer](model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[Swin Transformer V2](model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
1. **[Swin2SR](model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
1. **[SwitchTransformers](model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[Table Transformer](model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
1. **[Time Series Transformer](model_doc/time_series_transformer)** (from HuggingFace).
1. **[TimeSformer](model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UMT5](model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[UPerNet](model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
1. **[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
1. **[VideoMAE](model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViT Hybrid](model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VitDet](model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[ViTMatte](model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
1. **[ViTMSN](model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
1. **[VITS](model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
1. **[ViViT](model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[Whisper](model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
1. **[X-CLIP](model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
1. **[X-MOD](model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
1. **[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLM-V](model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLS-R](model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[YOLOS](model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
### Supported frameworks

The table below represents the current support in the library for each of those models: whether they have a Python tokenizer (called "slow") or a "fast" tokenizer backed by the 🤗 Tokenizers library, and whether they have support in Jax (via Flax), PyTorch, and/or TensorFlow.
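As a quick way to read the table, here is a minimal sketch that loads one fully supported checkpoint in every backend and inspects its tokenizer. It assumes the `bert-base-uncased` checkpoint purely as an example, and that the optional TensorFlow and Flax backends are installed alongside PyTorch.

```python
# Minimal sketch: BERT is listed below with PyTorch, TensorFlow and Flax support,
# so the same checkpoint loads in all three backends and ships a "fast" tokenizer.
from transformers import AutoTokenizer, AutoModel, TFAutoModel, FlaxAutoModel

checkpoint = "bert-base-uncased"  # example checkpoint; any fully supported model works

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer.is_fast)  # True when a 🤗 Tokenizers ("fast") implementation is available

pt_model = AutoModel.from_pretrained(checkpoint)        # PyTorch weights
tf_model = TFAutoModel.from_pretrained(checkpoint)      # TensorFlow weights
flax_model = FlaxAutoModel.from_pretrained(checkpoint)  # Flax / JAX weights
```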
<!--This table is updated automatically from the auto modules with _make fix-copies_. Do not update manually!-->
| Model | PyTorch support | TensorFlow support | Flax Support |
|:-----------------------------:|:---------------:|:------------------:|:------------:|
| ALBERT | ✅ | ✅ | ✅ |
| ALIGN | ✅ | ❌ | ❌ |
| AltCLIP | ✅ | ❌ | ❌ |
| Audio Spectrogram Transformer | ✅ | ❌ | ❌ |
| Autoformer | ✅ | ❌ | ❌ |
| Bark | ✅ | ❌ | ❌ |
| BART | ✅ | ✅ | ✅ |
| BEiT | ✅ | ❌ | ✅ |
| BERT | ✅ | ✅ | ✅ |
| Bert Generation | ✅ | ❌ | ❌ |
| BigBird | ✅ | ❌ | ✅ |
| BigBird-Pegasus | ✅ | ❌ | ❌ |
| BioGpt | ✅ | ❌ | ❌ |
| BiT | ✅ | ❌ | ❌ |
| Blenderbot | ✅ | ✅ | ✅ |
| BlenderbotSmall | ✅ | ✅ | ✅ |
| BLIP | ✅ | ✅ | ❌ |
| BLIP-2 | ✅ | ❌ | ❌ |
| BLOOM | ✅ | ❌ | ✅ |
| BridgeTower | ✅ | ❌ | ❌ |
| BROS | ✅ | ❌ | ❌ |
| CamemBERT | ✅ | ✅ | ❌ |
| CANINE | ✅ | ❌ | ❌ |
| Chinese-CLIP | ✅ | ❌ | ❌ |
| CLAP | ✅ | ❌ | ❌ |
| CLIP | ✅ | ✅ | ✅ |
| CLIPSeg | ✅ | ❌ | ❌ |
| CodeGen | ✅ | ❌ | ❌ |
| CodeLlama | ✅ | ❌ | ❌ |
| Conditional DETR | ✅ | ❌ | ❌ |
| ConvBERT | ✅ | ✅ | ❌ |
| ConvNeXT | ✅ | ✅ | ❌ |
| ConvNeXTV2 | ✅ | ❌ | ❌ |
| CPM-Ant | ✅ | ❌ | ❌ |
| CTRL | ✅ | ✅ | ❌ |
| CvT | ✅ | ✅ | ❌ |
| Data2VecAudio | ✅ | ❌ | ❌ |
| Data2VecText | ✅ | ❌ | ❌ |
| Data2VecVision | ✅ | ✅ | ❌ |
| DeBERTa | ✅ | ✅ | ❌ |
| DeBERTa-v2 | ✅ | ✅ | ❌ |
| Decision Transformer | ✅ | ❌ | ❌ |
| Deformable DETR | ✅ | ❌ | ❌ |
| DeiT | ✅ | ✅ | ❌ |
| DETA | ✅ | ❌ | ❌ |
| DETR | ✅ | ❌ | ❌ |
| DiNAT | ✅ | ❌ | ❌ |
| DINOv2 | ✅ | ❌ | ❌ |
| DistilBERT | ✅ | ✅ | ✅ |
| DonutSwin | ✅ | ❌ | ❌ |
| DPR | ✅ | ✅ | ❌ |
| DPT | ✅ | ❌ | ❌ |
| EfficientFormer | ✅ | ✅ | ❌ |
| EfficientNet | ✅ | ❌ | ❌ |
| ELECTRA | ✅ | ✅ | ✅ |
| EnCodec | ✅ | ❌ | ❌ |
| Encoder decoder | ✅ | ✅ | ✅ |
| ERNIE | ✅ | ❌ | ❌ |
| ErnieM | ✅ | ❌ | ❌ |
| ESM | ✅ | ✅ | ❌ |
| FairSeq Machine-Translation | ✅ | ❌ | ❌ |
| Falcon | ✅ | ❌ | ❌ |
| FlauBERT | ✅ | ✅ | ❌ |
| FLAVA | ✅ | ❌ | ❌ |
| FNet | ✅ | ❌ | ❌ |
| FocalNet | ✅ | ❌ | ❌ |
| Funnel Transformer | ✅ | ✅ | ❌ |
| GIT | ✅ | ❌ | ❌ |
| GLPN | ✅ | ❌ | ❌ |
| GPT Neo | ✅ | ❌ | ✅ |
| GPT NeoX | ✅ | ❌ | ❌ |
| GPT NeoX Japanese | ✅ | ❌ | ❌ |
| GPT-J | ✅ | ✅ | ✅ |
| GPT-Sw3 | ✅ | ✅ | ✅ |
| GPTBigCode | ✅ | ❌ | ❌ |
| GPTSAN-japanese | ✅ | ❌ | ❌ |
| Graphormer | ✅ | ❌ | ❌ |
| GroupViT | ✅ | ✅ | ❌ |
| Hubert | ✅ | ✅ | ❌ |
| I-BERT | ✅ | ❌ | ❌ |
| IDEFICS | ✅ | ❌ | ❌ |
| ImageGPT | ✅ | ❌ | ❌ |
| Informer | ✅ | ❌ | ❌ |
| InstructBLIP | ✅ | ❌ | ❌ |
| Jukebox | ✅ | ❌ | ❌ |
| LayoutLM | ✅ | ✅ | ❌ |
| LayoutLMv2 | ✅ | ❌ | ❌ |
| LayoutLMv3 | ✅ | ✅ | ❌ |
| LED | ✅ | ✅ | ❌ |
| LeViT | ✅ | ❌ | ❌ |
| LiLT | ✅ | ❌ | ❌ |
| LLaMA | ✅ | ❌ | ❌ |
| Longformer | ✅ | ✅ | ❌ |
| LongT5 | ✅ | ❌ | ✅ |
| LUKE | ✅ | ❌ | ❌ |
| LXMERT | ✅ | ✅ | ❌ |
| M-CTC-T | ✅ | ❌ | ❌ |
| M2M100 | ✅ | ❌ | ❌ |
| Marian | ✅ | ✅ | ✅ |
| MarkupLM | ✅ | ❌ | ❌ |
| Mask2Former | ✅ | ❌ | ❌ |
| MaskFormer | ✅ | ❌ | ❌ |
| MaskFormerSwin | ❌ | ❌ | ❌ |
| mBART | ✅ | ✅ | ✅ |
| MEGA | ✅ | ❌ | ❌ |
| Megatron-BERT | ✅ | ❌ | ❌ |
| MGP-STR | ✅ | ❌ | ❌ |
| MobileBERT | ✅ | ✅ | ❌ |
| MobileNetV1 | ✅ | ❌ | ❌ |
| MobileNetV2 | ✅ | ❌ | ❌ |
| MobileViT | ✅ | ✅ | ❌ |
| MobileViTV2 | ✅ | ❌ | ❌ |
| MPNet | ✅ | ✅ | ❌ |
| MPT | ✅ | ❌ | ❌ |
| MRA | ✅ | ❌ | ❌ |
| MT5 | ✅ | ✅ | ✅ |
| MusicGen | ✅ | ❌ | ❌ |
| MVP | ✅ | ❌ | ❌ |
| NAT | ✅ | ❌ | ❌ |
| Nezha | ✅ | ❌ | ❌ |
| NLLB-MOE | ✅ | ❌ | ❌ |
| Nyströmformer | ✅ | ❌ | ❌ |
| OneFormer | ✅ | ❌ | ❌ |
| OpenAI GPT | ✅ | ✅ | ❌ |
| OpenAI GPT-2 | ✅ | ✅ | ✅ |
| OpenLlama | ✅ | ❌ | ❌ |
| OPT | ✅ | ✅ | ✅ |
| OWL-ViT | ✅ | ❌ | ❌ |
| Pegasus | ✅ | ✅ | ✅ |
| PEGASUS-X | ✅ | ❌ | ❌ |
| Perceiver | ✅ | ❌ | ❌ |
| Persimmon | ✅ | ❌ | ❌ |
| Pix2Struct | ✅ | ❌ | ❌ |
| PLBart | ✅ | ❌ | ❌ |
| PoolFormer | ✅ | ❌ | ❌ |
| Pop2Piano | ✅ | ❌ | ❌ |
| ProphetNet | ✅ | ❌ | ❌ |
| PVT | ✅ | ❌ | ❌ |
| QDQBert | ✅ | ❌ | ❌ |
| RAG | ✅ | ✅ | ❌ |
| REALM | ✅ | ❌ | ❌ |
| Reformer | ✅ | ❌ | ❌ |
| RegNet | ✅ | ✅ | ✅ |
| RemBERT | ✅ | ✅ | ❌ |
| ResNet | ✅ | ✅ | ✅ |
| RetriBERT | ✅ | ❌ | ❌ |
| RoBERTa | ✅ | ✅ | ✅ |
| RoBERTa-PreLayerNorm | ✅ | ✅ | ✅ |
| RoCBert | ✅ | ❌ | ❌ |
| RoFormer | ✅ | ✅ | ✅ |
| RWKV | ✅ | ❌ | ❌ |
| SAM | ✅ | ✅ | ❌ |
| SegFormer | ✅ | ✅ | ❌ |
| SEW | ✅ | ❌ | ❌ |
| SEW-D | ✅ | ❌ | ❌ |
| Speech Encoder decoder | ✅ | ❌ | ✅ |
| Speech2Text | ✅ | ✅ | ❌ |
| Speech2Text2 | ❌ | ❌ | ❌ |
| SpeechT5 | ✅ | ❌ | ❌ |
| Splinter | ✅ | ❌ | ❌ |
| SqueezeBERT | ✅ | ❌ | ❌ |
| SwiftFormer | ✅ | ❌ | ❌ |
| Swin Transformer | ✅ | ✅ | ❌ |
| Swin Transformer V2 | ✅ | ❌ | ❌ |
| Swin2SR | ✅ | ❌ | ❌ |
| SwitchTransformers | ✅ | ❌ | ❌ |
| T5 | ✅ | ✅ | ✅ |
| Table Transformer | ✅ | ❌ | ❌ |
| TAPAS | ✅ | ✅ | ❌ |
| Time Series Transformer | ✅ | ❌ | ❌ |
| TimeSformer | ✅ | ❌ | ❌ |
| TimmBackbone | ❌ | ❌ | ❌ |
| Trajectory Transformer | ✅ | ❌ | ❌ |
| Transformer-XL | ✅ | ✅ | ❌ |
| TrOCR | ✅ | ❌ | ❌ |
| TVLT | ✅ | ❌ | ❌ |
| UMT5 | ✅ | ❌ | ❌ |
| UniSpeech | ✅ | ❌ | ❌ |
| UniSpeechSat | ✅ | ❌ | ❌ |
| UPerNet | ✅ | ❌ | ❌ |
| VAN | ✅ | ❌ | ❌ |
| VideoMAE | ✅ | ❌ | ❌ |
| ViLT | ✅ | ❌ | ❌ |
| Vision Encoder decoder | ✅ | ✅ | ✅ |
| VisionTextDualEncoder | ✅ | ✅ | ✅ |
| VisualBERT | ✅ | ❌ | ❌ |
| ViT | ✅ | ✅ | ✅ |
| ViT Hybrid | ✅ | ❌ | ❌ |
| VitDet | ✅ | ❌ | ❌ |
| ViTMAE | ✅ | ✅ | ❌ |
| ViTMatte | ✅ | ❌ | ❌ |
| ViTMSN | ✅ | ❌ | ❌ |
| VITS | ✅ | ❌ | ❌ |
| ViViT | ✅ | ❌ | ❌ |
| Wav2Vec2 | ✅ | ✅ | ✅ |
| Wav2Vec2-Conformer | ✅ | ❌ | ❌ |
| WavLM | ✅ | ❌ | ❌ |
| Whisper | ✅ | ✅ | ✅ |
| X-CLIP | ✅ | ❌ | ❌ |
|
||||
| X-MOD | тЬЕ | тЭМ | тЭМ |
|
||||
| XGLM | тЬЕ | тЬЕ | тЬЕ |
|
||||
| XLM | тЬЕ | тЬЕ | тЭМ |
|
||||
| XLM-ProphetNet | тЬЕ | тЭМ | тЭМ |
|
||||
| XLM-RoBERTa | тЬЕ | тЬЕ | тЬЕ |
|
||||
| XLM-RoBERTa-XL | тЬЕ | тЭМ | тЭМ |
|
||||
| XLNet | тЬЕ | тЬЕ | тЭМ |
|
||||
| YOLOS | тЬЕ | тЭМ | тЭМ |
|
||||
| YOSO | тЬЕ | тЭМ | тЭМ |
|
||||
|                                   Model                                   | PyTorch support | TensorFlow support | Flax Support |
|:------------------------------------------------------------------------:|:---------------:|:------------------:|:------------:|
|
||||
| [ALBERT](model_doc/albert) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [ALIGN](model_doc/align) | тЬЕ | тЭМ | тЭМ |
|
||||
| [AltCLIP](model_doc/altclip) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Autoformer](model_doc/autoformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Bark](model_doc/bark) | тЬЕ | тЭМ | тЭМ |
|
||||
| [BART](model_doc/bart) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [BARThez](model_doc/barthez) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [BARTpho](model_doc/bartpho) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [BEiT](model_doc/beit) | тЬЕ | тЭМ | тЬЕ |
|
||||
| [BERT](model_doc/bert) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [Bert Generation](model_doc/bert-generation) | тЬЕ | тЭМ | тЭМ |
|
||||
| [BertJapanese](model_doc/bert-japanese) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [BERTweet](model_doc/bertweet) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [BigBird](model_doc/big_bird) | тЬЕ | тЭМ | тЬЕ |
|
||||
| [BigBird-Pegasus](model_doc/bigbird_pegasus) | тЬЕ | тЭМ | тЭМ |
|
||||
| [BioGpt](model_doc/biogpt) | тЬЕ | тЭМ | тЭМ |
|
||||
| [BiT](model_doc/bit) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Blenderbot](model_doc/blenderbot) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [BlenderbotSmall](model_doc/blenderbot-small) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [BLIP](model_doc/blip) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [BLIP-2](model_doc/blip-2) | тЬЕ | тЭМ | тЭМ |
|
||||
| [BLOOM](model_doc/bloom) | тЬЕ | тЭМ | тЬЕ |
|
||||
| [BORT](model_doc/bort) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [BridgeTower](model_doc/bridgetower) | тЬЕ | тЭМ | тЭМ |
|
||||
| [BROS](model_doc/bros) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ByT5](model_doc/byt5) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [CamemBERT](model_doc/camembert) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [CANINE](model_doc/canine) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Chinese-CLIP](model_doc/chinese_clip) | тЬЕ | тЭМ | тЭМ |
|
||||
| [CLAP](model_doc/clap) | тЬЕ | тЭМ | тЭМ |
|
||||
| [CLIP](model_doc/clip) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [CLIPSeg](model_doc/clipseg) | тЬЕ | тЭМ | тЭМ |
|
||||
| [CodeGen](model_doc/codegen) | тЬЕ | тЭМ | тЭМ |
|
||||
| [CodeLlama](model_doc/code_llama) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Conditional DETR](model_doc/conditional_detr) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ConvBERT](model_doc/convbert) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [ConvNeXT](model_doc/convnext) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [ConvNeXTV2](model_doc/convnextv2) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [CPM](model_doc/cpm) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [CPM-Ant](model_doc/cpmant) | тЬЕ | тЭМ | тЭМ |
|
||||
| [CTRL](model_doc/ctrl) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [CvT](model_doc/cvt) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [Data2VecAudio](model_doc/data2vec) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Data2VecText](model_doc/data2vec) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Data2VecVision](model_doc/data2vec) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [DeBERTa](model_doc/deberta) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [DeBERTa-v2](model_doc/deberta-v2) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [Decision Transformer](model_doc/decision_transformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Deformable DETR](model_doc/deformable_detr) | тЬЕ | тЭМ | тЭМ |
|
||||
| [DeiT](model_doc/deit) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [DePlot](model_doc/deplot) | тЬЕ | тЭМ | тЭМ |
|
||||
| [DETA](model_doc/deta) | тЬЕ | тЭМ | тЭМ |
|
||||
| [DETR](model_doc/detr) | тЬЕ | тЭМ | тЭМ |
|
||||
| [DialoGPT](model_doc/dialogpt) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [DiNAT](model_doc/dinat) | тЬЕ | тЭМ | тЭМ |
|
||||
| [DINOv2](model_doc/dinov2) | тЬЕ | тЭМ | тЭМ |
|
||||
| [DistilBERT](model_doc/distilbert) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [DiT](model_doc/dit) | тЬЕ | тЭМ | тЬЕ |
|
||||
| [DonutSwin](model_doc/donut) | тЬЕ | тЭМ | тЭМ |
|
||||
| [DPR](model_doc/dpr) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [DPT](model_doc/dpt) | тЬЕ | тЭМ | тЭМ |
|
||||
| [EfficientFormer](model_doc/efficientformer) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [EfficientNet](model_doc/efficientnet) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ELECTRA](model_doc/electra) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [EnCodec](model_doc/encodec) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Encoder decoder](model_doc/encoder-decoder) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [ERNIE](model_doc/ernie) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ErnieM](model_doc/ernie_m) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ESM](model_doc/esm) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [FairSeq Machine-Translation](model_doc/fsmt) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Falcon](model_doc/falcon) | тЬЕ | тЭМ | тЭМ |
|
||||
| [FLAN-T5](model_doc/flan-t5) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [FLAN-UL2](model_doc/flan-ul2) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [FlauBERT](model_doc/flaubert) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [FLAVA](model_doc/flava) | тЬЕ | тЭМ | тЭМ |
|
||||
| [FNet](model_doc/fnet) | тЬЕ | тЭМ | тЭМ |
|
||||
| [FocalNet](model_doc/focalnet) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Funnel Transformer](model_doc/funnel) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [Fuyu](model_doc/fuyu) | тЬЕ | тЭМ | тЭМ |
|
||||
| [GIT](model_doc/git) | тЬЕ | тЭМ | тЭМ |
|
||||
| [GLPN](model_doc/glpn) | тЬЕ | тЭМ | тЭМ |
|
||||
| [GPT Neo](model_doc/gpt_neo) | тЬЕ | тЭМ | тЬЕ |
|
||||
| [GPT NeoX](model_doc/gpt_neox) | тЬЕ | тЭМ | тЭМ |
|
||||
| [GPT NeoX Japanese](model_doc/gpt_neox_japanese) | тЬЕ | тЭМ | тЭМ |
|
||||
| [GPT-J](model_doc/gptj) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [GPT-Sw3](model_doc/gpt-sw3) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [GPTBigCode](model_doc/gpt_bigcode) | тЬЕ | тЭМ | тЭМ |
|
||||
| [GPTSAN-japanese](model_doc/gptsan-japanese) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Graphormer](model_doc/graphormer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [GroupViT](model_doc/groupvit) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [HerBERT](model_doc/herbert) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [Hubert](model_doc/hubert) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [I-BERT](model_doc/ibert) | тЬЕ | тЭМ | тЭМ |
|
||||
| [IDEFICS](model_doc/idefics) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ImageGPT](model_doc/imagegpt) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Informer](model_doc/informer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [InstructBLIP](model_doc/instructblip) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Jukebox](model_doc/jukebox) | тЬЕ | тЭМ | тЭМ |
|
||||
| [KOSMOS-2](model_doc/kosmos-2) | тЬЕ | тЭМ | тЭМ |
|
||||
| [LayoutLM](model_doc/layoutlm) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [LayoutLMv2](model_doc/layoutlmv2) | тЬЕ | тЭМ | тЭМ |
|
||||
| [LayoutLMv3](model_doc/layoutlmv3) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [LayoutXLM](model_doc/layoutxlm) | тЬЕ | тЭМ | тЭМ |
|
||||
| [LED](model_doc/led) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [LeViT](model_doc/levit) | тЬЕ | тЭМ | тЭМ |
|
||||
| [LiLT](model_doc/lilt) | тЬЕ | тЭМ | тЭМ |
|
||||
| [LLaMA](model_doc/llama) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Llama2](model_doc/llama2) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Longformer](model_doc/longformer) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [LongT5](model_doc/longt5) | тЬЕ | тЭМ | тЬЕ |
|
||||
| [LUKE](model_doc/luke) | тЬЕ | тЭМ | тЭМ |
|
||||
| [LXMERT](model_doc/lxmert) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [M-CTC-T](model_doc/mctct) | тЬЕ | тЭМ | тЭМ |
|
||||
| [M2M100](model_doc/m2m_100) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Marian](model_doc/marian) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [MarkupLM](model_doc/markuplm) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Mask2Former](model_doc/mask2former) | тЬЕ | тЭМ | тЭМ |
|
||||
| [MaskFormer](model_doc/maskformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [MatCha](model_doc/matcha) | тЬЕ | тЭМ | тЭМ |
|
||||
| [mBART](model_doc/mbart) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [mBART-50](model_doc/mbart50) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [MEGA](model_doc/mega) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Megatron-BERT](model_doc/megatron-bert) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Megatron-GPT2](model_doc/megatron_gpt2) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [MGP-STR](model_doc/mgp-str) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Mistral](model_doc/mistral) | тЬЕ | тЭМ | тЭМ |
|
||||
| [mLUKE](model_doc/mluke) | тЬЕ | тЭМ | тЭМ |
|
||||
| [MMS](model_doc/mms) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [MobileBERT](model_doc/mobilebert) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [MobileNetV1](model_doc/mobilenet_v1) | тЬЕ | тЭМ | тЭМ |
|
||||
| [MobileNetV2](model_doc/mobilenet_v2) | тЬЕ | тЭМ | тЭМ |
|
||||
| [MobileViT](model_doc/mobilevit) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [MobileViTV2](model_doc/mobilevitv2) | тЬЕ | тЭМ | тЭМ |
|
||||
| [MPNet](model_doc/mpnet) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [MPT](model_doc/mpt) | тЬЕ | тЭМ | тЭМ |
|
||||
| [MRA](model_doc/mra) | тЬЕ | тЭМ | тЭМ |
|
||||
| [MT5](model_doc/mt5) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [MusicGen](model_doc/musicgen) | тЬЕ | тЭМ | тЭМ |
|
||||
| [MVP](model_doc/mvp) | тЬЕ | тЭМ | тЭМ |
|
||||
| [NAT](model_doc/nat) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Nezha](model_doc/nezha) | тЬЕ | тЭМ | тЭМ |
|
||||
| [NLLB](model_doc/nllb) | тЬЕ | тЭМ | тЭМ |
|
||||
| [NLLB-MOE](model_doc/nllb-moe) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Nougat](model_doc/nougat) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [Nystr├╢mformer](model_doc/nystromformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [OneFormer](model_doc/oneformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [OpenAI GPT](model_doc/openai-gpt) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [OpenAI GPT-2](model_doc/gpt2) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [OpenLlama](model_doc/open-llama) | тЬЕ | тЭМ | тЭМ |
|
||||
| [OPT](model_doc/opt) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [OWL-ViT](model_doc/owlvit) | тЬЕ | тЭМ | тЭМ |
|
||||
| [OWLv2](model_doc/owlv2) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Pegasus](model_doc/pegasus) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [PEGASUS-X](model_doc/pegasus_x) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Perceiver](model_doc/perceiver) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Persimmon](model_doc/persimmon) | тЬЕ | тЭМ | тЭМ |
|
||||
| [PhoBERT](model_doc/phobert) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [Pix2Struct](model_doc/pix2struct) | тЬЕ | тЭМ | тЭМ |
|
||||
| [PLBart](model_doc/plbart) | тЬЕ | тЭМ | тЭМ |
|
||||
| [PoolFormer](model_doc/poolformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Pop2Piano](model_doc/pop2piano) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ProphetNet](model_doc/prophetnet) | тЬЕ | тЭМ | тЭМ |
|
||||
| [PVT](model_doc/pvt) | тЬЕ | тЭМ | тЭМ |
|
||||
| [QDQBert](model_doc/qdqbert) | тЬЕ | тЭМ | тЭМ |
|
||||
| [RAG](model_doc/rag) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [REALM](model_doc/realm) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Reformer](model_doc/reformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [RegNet](model_doc/regnet) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [RemBERT](model_doc/rembert) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [ResNet](model_doc/resnet) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [RetriBERT](model_doc/retribert) | тЬЕ | тЭМ | тЭМ |
|
||||
| [RoBERTa](model_doc/roberta) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [RoCBert](model_doc/roc_bert) | тЬЕ | тЭМ | тЭМ |
|
||||
| [RoFormer](model_doc/roformer) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [RWKV](model_doc/rwkv) | тЬЕ | тЭМ | тЭМ |
|
||||
| [SAM](model_doc/sam) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [SeamlessM4T](model_doc/seamless_m4t) | тЬЕ | тЭМ | тЭМ |
|
||||
| [SegFormer](model_doc/segformer) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [SEW](model_doc/sew) | тЬЕ | тЭМ | тЭМ |
|
||||
| [SEW-D](model_doc/sew-d) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Speech Encoder decoder](model_doc/speech-encoder-decoder) | тЬЕ | тЭМ | тЬЕ |
|
||||
| [Speech2Text](model_doc/speech_to_text) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [SpeechT5](model_doc/speecht5) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Splinter](model_doc/splinter) | тЬЕ | тЭМ | тЭМ |
|
||||
| [SqueezeBERT](model_doc/squeezebert) | тЬЕ | тЭМ | тЭМ |
|
||||
| [SwiftFormer](model_doc/swiftformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Swin Transformer](model_doc/swin) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [Swin Transformer V2](model_doc/swinv2) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Swin2SR](model_doc/swin2sr) | тЬЕ | тЭМ | тЭМ |
|
||||
| [SwitchTransformers](model_doc/switch_transformers) | тЬЕ | тЭМ | тЭМ |
|
||||
| [T5](model_doc/t5) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [T5v1.1](model_doc/t5v1.1) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [Table Transformer](model_doc/table-transformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [TAPAS](model_doc/tapas) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [TAPEX](model_doc/tapex) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [Time Series Transformer](model_doc/time_series_transformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [TimeSformer](model_doc/timesformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Trajectory Transformer](model_doc/trajectory_transformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Transformer-XL](model_doc/transfo-xl) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [TrOCR](model_doc/trocr) | тЬЕ | тЭМ | тЭМ |
|
||||
| [TVLT](model_doc/tvlt) | тЬЕ | тЭМ | тЭМ |
|
||||
| [UL2](model_doc/ul2) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [UMT5](model_doc/umt5) | тЬЕ | тЭМ | тЭМ |
|
||||
| [UniSpeech](model_doc/unispeech) | тЬЕ | тЭМ | тЭМ |
|
||||
| [UniSpeechSat](model_doc/unispeech-sat) | тЬЕ | тЭМ | тЭМ |
|
||||
| [UPerNet](model_doc/upernet) | тЬЕ | тЭМ | тЭМ |
|
||||
| [VAN](model_doc/van) | тЬЕ | тЭМ | тЭМ |
|
||||
| [VideoMAE](model_doc/videomae) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ViLT](model_doc/vilt) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Vision Encoder decoder](model_doc/vision-encoder-decoder) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [VisionTextDualEncoder](model_doc/vision-text-dual-encoder) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [VisualBERT](model_doc/visual_bert) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ViT](model_doc/vit) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [ViT Hybrid](model_doc/vit_hybrid) | тЬЕ | тЭМ | тЭМ |
|
||||
| [VitDet](model_doc/vitdet) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ViTMAE](model_doc/vit_mae) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [ViTMatte](model_doc/vitmatte) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ViTMSN](model_doc/vit_msn) | тЬЕ | тЭМ | тЭМ |
|
||||
| [VITS](model_doc/vits) | тЬЕ | тЭМ | тЭМ |
|
||||
| [ViViT](model_doc/vivit) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Wav2Vec2](model_doc/wav2vec2) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [Wav2Vec2-Conformer](model_doc/wav2vec2-conformer) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [WavLM](model_doc/wavlm) | тЬЕ | тЭМ | тЭМ |
|
||||
| [Whisper](model_doc/whisper) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [X-CLIP](model_doc/xclip) | тЬЕ | тЭМ | тЭМ |
|
||||
| [X-MOD](model_doc/xmod) | тЬЕ | тЭМ | тЭМ |
|
||||
| [XGLM](model_doc/xglm) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [XLM](model_doc/xlm) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [XLM-ProphetNet](model_doc/xlm-prophetnet) | тЬЕ | тЭМ | тЭМ |
|
||||
| [XLM-RoBERTa](model_doc/xlm-roberta) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [XLM-RoBERTa-XL](model_doc/xlm-roberta-xl) | тЬЕ | тЭМ | тЭМ |
|
||||
| [XLM-V](model_doc/xlm-v) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [XLNet](model_doc/xlnet) | тЬЕ | тЬЕ | тЭМ |
|
||||
| [XLS-R](model_doc/xls_r) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2) | тЬЕ | тЬЕ | тЬЕ |
|
||||
| [YOLOS](model_doc/yolos) | тЬЕ | тЭМ | тЭМ |
|
||||
| [YOSO](model_doc/yoso) | тЬЕ | тЭМ | тЭМ |
|
||||
|
||||
<!-- End table-->
|
||||
|
||||
@ -169,28 +169,28 @@ Pretrained models are downloaded and locally cached at: `~/.cache/huggingface/hu
|
||||
|
||||
## Offline mode
|
||||
|
||||
🤗 Transformers is able to run in a firewalled or offline environment by only using local files. Set the environment variable `TRANSFORMERS_OFFLINE=1` to enable this behavior.
|
||||
Run 🤗 Transformers in a firewalled or offline environment with locally cached files by setting the environment variable `TRANSFORMERS_OFFLINE=1`.
|
||||
|
||||
<Tip>
|
||||
|
||||
Add [🤗 Datasets](https://huggingface.co/docs/datasets/) to your offline training workflow by setting the environment variable `HF_DATASETS_OFFLINE=1`.
|
||||
Add [🤗 Datasets](https://huggingface.co/docs/datasets/) to your offline training workflow with the environment variable `HF_DATASETS_OFFLINE=1`.
|
||||
|
||||
</Tip>
|
||||
|
||||
For example, you would typically run a program on a normal network firewalled to external instances with the following command:
|
||||
|
||||
```bash
|
||||
python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
|
||||
```
|
||||
|
||||
Run this same program in an offline instance with:
|
||||
|
||||
```bash
|
||||
HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
|
||||
python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
|
||||
```
|
||||
|
||||
The script should now run without hanging or waiting to timeout because it knows it should only look for local files.
|
||||
This script should run without hanging or waiting to timeout because it won't attempt to download the model from the Hub.
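If it is more convenient, the same variables can also be set from Python before the first Hub access; a minimal sketch, assuming the checkpoint has already been downloaded to the local cache:

```py
import os

# Must be set before the first Hub access (ideally before importing transformers).
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Loads from the local cache only; raises an error if the files were never downloaded.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
```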
|
||||
|
||||
You can also bypass loading a model from the Hub in each [`~PreTrainedModel.from_pretrained`] call with the [`local_files_only`] parameter. When set to `True`, only local files are loaded:
|
||||
|
||||
```py
|
||||
from transformers import T5Model
|
||||
|
||||
model = T5Model.from_pretrained("./path/to/local/directory", local_files_only=True)
|
||||
```
|
||||
|
||||
### Fetch models and tokenizers to use offline
|
||||
|
||||
|
||||
@ -74,14 +74,13 @@ If you're interested in basic LLM usage, our high-level [`Pipeline`](pipeline_tu
|
||||
|
||||
</Tip>
|
||||
|
||||
<!-- TODO: update example to llama 2 (or a newer popular baseline) when it becomes ungated -->
|
||||
First, you need to load the model.
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForCausalLM
|
||||
|
||||
>>> model = AutoModelForCausalLM.from_pretrained(
|
||||
... "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True
|
||||
... "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
|
||||
... )
|
||||
```
|
||||
|
||||
@ -97,18 +96,31 @@ Next, you need to preprocess your text input with a [tokenizer](tokenizer_summar
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
|
||||
>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
|
||||
```
|
||||
|
||||
The `model_inputs` variable holds the tokenized text input, as well as the attention mask. While [`~generation.GenerationMixin.generate`] does its best to infer the attention mask when it is not passed, we recommend passing it whenever possible for optimal results.
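For instance, assuming the `model` and `model_inputs` objects defined above, the mask can be passed explicitly instead of relying on `**model_inputs` unpacking; a minimal sketch:

```py
>>> generated_ids = model.generate(
...     input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask
... )
```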
|
||||
|
||||
Finally, call the [`~generation.GenerationMixin.generate`] method to return the generated tokens, which should be converted to text before printing.
|
||||
After tokenizing the inputs, you can call the [`~generation.GenerationMixin.generate`] method, which returns the generated tokens. The generated tokens should then be converted to text before printing.
|
||||
|
||||
```py
|
||||
>>> generated_ids = model.generate(**model_inputs)
|
||||
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
||||
'A list of colors: red, blue, green, yellow, black, white, and brown'
|
||||
'A list of colors: red, blue, green, yellow, orange, purple, pink,'
|
||||
```
|
||||
|
||||
Finally, you don't need to do it one sequence at a time! You can batch your inputs, which will greatly improve the throughput at a small latency and memory cost. All you need to do is to make sure you pad your inputs properly (more on that below).
|
||||
|
||||
```py
|
||||
>>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
|
||||
>>> model_inputs = tokenizer(
|
||||
... ["A list of colors: red, blue", "Portugal is"], return_tensors="pt", padding=True
|
||||
... ).to("cuda")
|
||||
>>> generated_ids = model.generate(**model_inputs)
|
||||
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
|
||||
['A list of colors: red, blue, green, yellow, orange, purple, pink,',
|
||||
'Portugal is a country in southwestern Europe, on the Iber']
|
||||
```
|
||||
|
||||
And that's it! In a few lines of code, you can harness the power of an LLM.
|
||||
@ -121,10 +133,10 @@ There are many [generation strategies](generation_strategies), and sometimes the
|
||||
```py
|
||||
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")
|
||||
>>> tokenizer.pad_token = tokenizer.eos_token # Llama has no pad token by default
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
>>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
|
||||
>>> model = AutoModelForCausalLM.from_pretrained(
|
||||
... "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True
|
||||
... "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
|
||||
... )
|
||||
```
|
||||
|
||||
@ -154,7 +166,7 @@ By default, and unless specified in the [`~generation.GenerationConfig`] file, `
|
||||
```py
|
||||
>>> # Set seed for reproducibility -- you don't need this unless you want full reproducibility
|
||||
>>> from transformers import set_seed
|
||||
>>> set_seed(0)
|
||||
>>> set_seed(42)
|
||||
|
||||
>>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")
|
||||
|
||||
@ -166,7 +178,7 @@ By default, and unless specified in the [`~generation.GenerationConfig`] file, `
|
||||
>>> # With sampling, the output becomes more creative!
|
||||
>>> generated_ids = model.generate(**model_inputs, do_sample=True)
|
||||
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
||||
'I am a cat.\nI just need to be. I am always.\nEvery time'
|
||||
'I am a cat. Specifically, I am an indoor-only cat. I'
|
||||
```
|
||||
|
||||
### Wrong padding side
|
||||
@ -175,17 +187,17 @@ LLMs are [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt
|
||||
|
||||
```py
|
||||
>>> # The tokenizer initialized above has right-padding active by default: the 1st sequence,
|
||||
>>> # which is shorter, has padding on the right side. Generation fails.
|
||||
>>> # which is shorter, has padding on the right side. Generation fails to capture the logic.
|
||||
>>> model_inputs = tokenizer(
|
||||
... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
|
||||
... ).to("cuda")
|
||||
>>> generated_ids = model.generate(**model_inputs)
|
||||
>>> tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)[0]
|
||||
''
|
||||
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
||||
'1, 2, 33333333333'
|
||||
|
||||
>>> # With left-padding, it works as expected!
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b", padding_side="left")
|
||||
>>> tokenizer.pad_token = tokenizer.eos_token # Llama has no pad token by default
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
|
||||
>>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
|
||||
>>> model_inputs = tokenizer(
|
||||
... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
|
||||
... ).to("cuda")
|
||||
@ -194,26 +206,61 @@ LLMs are [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt
|
||||
'1, 2, 3, 4, 5, 6,'
|
||||
```
|
||||
|
||||
<!-- TODO: when the prompting guide is ready, mention the importance of setting the right prompt in this section -->
|
||||
### Wrong prompt
|
||||
|
||||
Some models and tasks expect a certain input prompt format to work properly. When this format is not applied, you will get a silent performance degradation: the model kinda works, but not as well as if you were following the expected prompt. More information about prompting, including which models and tasks require a specific prompt format, is available in this [guide](tasks/prompting). Let's see an example with a chat LLM, which makes use of [chat templating](chat_templating):
|
||||
|
||||
```python
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
|
||||
>>> model = AutoModelForCausalLM.from_pretrained(
|
||||
... "HuggingFaceH4/zephyr-7b-alpha", device_map="auto", load_in_4bit=True
|
||||
... )
|
||||
>>> set_seed(0)
|
||||
>>> prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
|
||||
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
|
||||
>>> input_length = model_inputs.input_ids.shape[1]
|
||||
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=20)
|
||||
>>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
|
||||
"I'm not a thug, but i can tell you that a human cannot eat"
|
||||
>>> # Oh no, it did not follow our instruction to reply as a thug! Let's see what happens when we write
|
||||
>>> # a better prompt and use the right template for this model (through `tokenizer.apply_chat_template`)
|
||||
|
||||
>>> set_seed(0)
|
||||
>>> messages = [
|
||||
... {
|
||||
... "role": "system",
|
||||
... "content": "You are a friendly chatbot who always responds in the style of a thug",
|
||||
... },
|
||||
... {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
|
||||
... ]
|
||||
>>> model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
|
||||
>>> input_length = model_inputs.shape[1]
|
||||
>>> generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20)
|
||||
>>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
|
||||
'None, you thug. How bout you try to focus on more useful questions?'
|
||||
>>> # As we can see, it followed a proper thug style 😎
|
||||
```
|
||||
|
||||
## Further resources
|
||||
|
||||
While the autoregressive generation process is relatively straightforward, making the most out of your LLM can be a challenging endeavor because there are many moving parts. For your next steps to help you dive deeper into LLM usage and understanding:
|
||||
|
||||
<!-- TODO: complete with new guides -->
|
||||
### Advanced generate usage
|
||||
|
||||
1. [Guide](generation_strategies) on how to control different generation methods, how to set up the generation configuration file, and how to stream the output;
|
||||
2. API reference on [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`], and [generate-related classes](internal/generation_utils).
|
||||
2. [Guide](chat_templating) on the prompt template for chat LLMs;
|
||||
3. [Guide](tasks/prompting) on how to get the most out of prompt design;
|
||||
4. API reference on [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`], and [generate-related classes](internal/generation_utils).
|
||||
|
||||
### LLM leaderboards
|
||||
|
||||
1. [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), which focuses on the quality of the open-source models;
|
||||
2. [Open LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard), which focuses on LLM throughput.
|
||||
|
||||
### Latency and throughput
|
||||
### Latency, throughput and memory utilization
|
||||
|
||||
1. [Guide](main_classes/quantization) on dynamic quantization, which shows you how to drastically reduce your memory requirements.
|
||||
1. [Guide](llm_tutorial_optimization) on how to optimize LLMs for speed and memory;
|
||||
2. [Guide](main_classes/quantization) on quantization such as bitsandbytes and autogptq, which shows you how to drastically reduce your memory requirements.
|
||||
|
||||
### Related libraries
|
||||
|
||||
|
||||
739
docs/source/en/llm_tutorial_optimization.md
Normal file
@ -0,0 +1,739 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
# Optimizing LLMs for Speed and Memory
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Large Language Models (LLMs) such as GPT3/4, [Falcon](https://huggingface.co/tiiuae/falcon-40b), and [Llama](https://huggingface.co/meta-llama/Llama-2-70b-hf) are rapidly advancing in their ability to tackle human-centric tasks, establishing themselves as essential tools in modern knowledge-based industries.
|
||||
Deploying these models in real-world tasks remains challenging, however:
|
||||
|
||||
- To exhibit near-human text understanding and generation capabilities, LLMs currently need to comprise billions of parameters (see [Kaplan et al](https://arxiv.org/abs/2001.08361), [Wei et. al](https://arxiv.org/abs/2206.07682)). This consequently amplifies the memory demands for inference.
|
||||
- In many real-world tasks, LLMs need to be given extensive contextual information. This necessitates the model's capability to manage very long input sequences during inference.
|
||||
|
||||
The crux of these challenges lies in augmenting the computational and memory capabilities of LLMs, especially when handling expansive input sequences.
|
||||
|
||||
In this guide, we will go over the effective techniques for efficient LLM deployment:
|
||||
|
||||
1. **Lower Precision**: Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization.md) can achieve computational advantages without a considerable decline in model performance.
|
||||
|
||||
2. **Flash Attention:** Flash Attention is a variation of the attention algorithm that not only provides a more memory-efficient approach but also realizes increased efficiency due to optimized GPU memory utilization.
|
||||
|
||||
3. **Architectural Innovations:** Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancements in model architectures here are [Alibi](https://arxiv.org/abs/2108.12409), [Rotary embeddings](https://arxiv.org/abs/2104.09864), [Multi-Query Attention (MQA)](https://arxiv.org/abs/1911.02150) and [Grouped-Query-Attention (GQA)](https://arxiv.org/abs/2305.13245).
|
||||
|
||||
Throughout this guide, we will offer an analysis of auto-regressive generation from a tensor's perspective. We delve into the pros and cons of adopting lower precision, provide a comprehensive exploration of the latest attention algorithms, and discuss improved LLM architectures. While doing so, we run practical examples showcasing each of the feature improvements.
|
||||
|
||||
## 1. Lower Precision
|
||||
|
||||
Memory requirements of LLMs can be best understood by seeing the LLM as a set of weight matrices and vectors and the text inputs as a sequence of vectors. In the following, the definition *weights* will be used to signify all model weight matrices and vectors.
|
||||
|
||||
At the time of writing this guide, LLMs consist of at least a couple billion parameters. Each parameter thereby is made of a decimal number, e.g. `4.5689` which is usually stored in either [float32](https://en.wikipedia.org/wiki/Single-precision_floating-point_format), [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format), or [float16](https://en.wikipedia.org/wiki/Half-precision_floating-point_format) format. This allows us to easily compute the memory requirement to load the LLM into memory:
|
||||
|
||||
> *Loading the weights of a model having X billion parameters requires roughly 4 * X GB of VRAM in float32 precision*
|
||||
|
||||
Nowadays, models are however rarely trained in full float32 precision, but usually in bfloat16 precision or less frequently in float16 precision. Therefore the rule of thumb becomes:
|
||||
|
||||
> *Loading the weights of a model having X billion parameters requires roughly 2 * X GB of VRAM in bfloat16/float16 precision*
|
||||
|
||||
For shorter text inputs (less than 1024 tokens), the memory requirement for inference is very much dominated by the memory requirement to load the weights. Therefore, for now, let's assume that the memory requirement for inference is equal to the memory requirement to load the model into the GPU VRAM.
|
||||
|
||||
To give some examples of how much VRAM it roughly takes to load a model in bfloat16:
|
||||
|
||||
- **GPT3** requires 2 \* 175 GB = **350 GB** VRAM
|
||||
- [**Bloom**](https://huggingface.co/bigscience/bloom) requires 2 \* 176 GB = **352 GB** VRAM
|
||||
- [**Llama-2-70b**](https://huggingface.co/meta-llama/Llama-2-70b-hf) requires 2 \* 70 GB = **140 GB** VRAM
|
||||
- [**Falcon-40b**](https://huggingface.co/tiiuae/falcon-40b) requires 2 \* 40 GB = **80 GB** VRAM
|
||||
- [**MPT-30b**](https://huggingface.co/mosaicml/mpt-30b) requires 2 \* 30 GB = **60 GB** VRAM
|
||||
- [**bigcode/starcoder**](https://huggingface.co/bigcode/starcoder) requires 2 \* 15.5 = **31 GB** VRAM
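These estimates follow directly from the rule of thumb above; a minimal helper to reproduce them (the parameter counts, in billions, are the published model sizes from the list above):

```python
def vram_estimate_gb(n_params_billion: float, bytes_per_param: int = 2) -> float:
    """Rough VRAM needed just to hold the weights: 2 bytes/param for bfloat16/float16, 4 for float32."""
    return n_params_billion * bytes_per_param


for name, billions in [("Llama-2-70b", 70), ("Falcon-40b", 40), ("MPT-30b", 30), ("bigcode/starcoder", 15.5)]:
    print(f"{name}: ~{vram_estimate_gb(billions):.0f} GB in bfloat16")
```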
|
||||
|
||||
As of writing this document, the largest GPU chips on the market are the A100 and H100, offering 80GB of VRAM. Most of the models listed before require more than 80GB just to be loaded and therefore necessarily require [tensor parallelism](https://huggingface.co/docs/transformers/perf_train_gpu_many#tensor-parallelism) and/or [pipeline parallelism](https://huggingface.co/docs/transformers/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism).
|
||||
|
||||
🤗 Transformers does not support tensor parallelism out of the box as it requires the model architecture to be written in a specific way. If you're interested in writing models in a tensor-parallelism-friendly way, feel free to have a look at [the text-generation-inference library](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models/custom_modeling).
|
||||
|
||||
Naive pipeline parallelism is supported out of the box. For this, simply load the model with `device_map="auto"` which will automatically place the different layers on the available GPUs as explained [here](https://huggingface.co/docs/accelerate/v0.22.0/en/concept_guides/big_model_inference).
|
||||
Note, however that while very effective, this naive pipeline parallelism does not tackle the issues of GPU idling. For this more advanced pipeline parallelism is required as explained [here](https://huggingface.co/docs/transformers/v4.34.0/en/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism).
|
||||
|
||||
If you have access to an 8 x 80GB A100 node, you could load BLOOM as follows
|
||||
|
||||
```bash
|
||||
!pip install transformers accelerate bitsandbytes optimum
|
||||
```
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", device_map="auto", pad_token_id=0)
|
||||
```
|
||||
|
||||
By using `device_map="auto"` the attention layers would be equally distributed over all available GPUs.
|
||||
|
||||
In this guide, we will use [bigcode/octocoder](https://huggingface.co/bigcode/octocoder) as it can be run on a single 40 GB A100 GPU. Note that all memory and speed optimizations that we will apply going forward are equally applicable to models that require model or tensor parallelism.
|
||||
|
||||
Since the model is loaded in bfloat16 precision, using our rule of thumb above, we would expect the memory requirement to run inference with `bigcode/octocoder` to be around 31 GB VRAM. Let's give it a try.
|
||||
|
||||
We first load the model and tokenizer and then pass both to Transformers' [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines) object.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
||||
import torch
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto", pad_token_id=0)
|
||||
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
|
||||
|
||||
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
||||
```
|
||||
|
||||
```python
|
||||
prompt = "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer:"
|
||||
|
||||
result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):]
|
||||
result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single
|
||||
```
|
||||
|
||||
Nice, we can now directly use the result to convert bytes into Gigabytes.
|
||||
|
||||
```python
|
||||
def bytes_to_giga_bytes(bytes):
|
||||
return bytes / 1024 / 1024 / 1024
|
||||
```
|
||||
|
||||
Let's call [`torch.cuda.max_memory_allocated`](https://pytorch.org/docs/stable/generated/torch.cuda.max_memory_allocated.html) to measure the peak GPU memory allocation.
|
||||
|
||||
```python
|
||||
bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```bash
|
||||
29.0260648727417
|
||||
```
|
||||
|
||||
Close enough to our back-of-the-envelope computation! We can see the number is not exactly correct as going from bytes to kilobytes requires a multiplication of 1024 instead of 1000. Therefore the back-of-the-envelope formula can also be understood as an "at most X GB" computation.
|
||||
Note that if we had tried to run the model in full float32 precision, a whopping 64 GB of VRAM would have been required.
|
||||
|
||||
> Almost all models are trained in bfloat16 nowadays, there is no reason to run the model in full float32 precision if [your GPU supports bfloat16](https://discuss.pytorch.org/t/bfloat16-native-support/117155/5). Float32 won't give better inference results than the precision that was used to train the model.
|
||||
|
||||
If you are unsure in which format the model weights are stored on the Hub, you can always look into the checkpoint's config under `"torch_dtype"`, *e.g.* [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to set the model to the same precision type as written in the config when loading with `from_pretrained(..., torch_dtype=...)`, except when the original type is float32, in which case one can use either `float16` or `bfloat16` for inference.
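As a sketch of that recommendation (the checkpoint name is only an example), the config can be inspected with `AutoConfig`, or `torch_dtype="auto"` can be passed so that `from_pretrained` picks up the dtype declared in the config:

```python
from transformers import AutoConfig, AutoModelForCausalLM

checkpoint = "bigcode/octocoder"  # any checkpoint works here

# Inspect the precision the weights were saved in ...
config = AutoConfig.from_pretrained(checkpoint)
print(config.torch_dtype)  # e.g. torch.bfloat16

# ... and load with that same precision. torch_dtype="auto" reads it from the config for you.
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
```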
|
||||
|
||||
|
||||
Let's define a `flush(...)` function to free all allocated memory so that we can accurately measure the peak allocated GPU memory.
|
||||
|
||||
```python
|
||||
del pipe
|
||||
del model
|
||||
|
||||
import gc
|
||||
import torch
|
||||
|
||||
def flush():
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
```
|
||||
|
||||
Let's call it now for the next experiment.
|
||||
|
||||
```python
|
||||
flush()
|
||||
```
|
||||
In recent versions of the Accelerate library, you can also use a utility method called `release_memory()`
|
||||
|
||||
```python
|
||||
from accelerate.utils import release_memory
|
||||
# ...
|
||||
|
||||
release_memory(model)
|
||||
```
|
||||
|
||||
Now what if your GPU does not have 32 GB of VRAM? It has been found that model weights can be quantized to 8-bit or 4-bits without a significant loss in performance (see [Dettmers et al.](https://arxiv.org/abs/2208.07339)).
|
||||
Models can be quantized even to 3 or 2 bits with an acceptable loss in performance, as shown in the recent [GPTQ paper](https://arxiv.org/abs/2210.17323) 🤯.
|
||||
|
||||
Without going into too many details, quantization schemes aim at reducing the precision of weights while trying to keep the model's inference results as accurate as possible (*a.k.a* as close as possible to bfloat16).
|
||||
Note that quantization works especially well for text generation since all we care about is choosing the *set of most likely next tokens* and don't really care about the exact values of the next token *logit* distribution.
|
||||
All that matters is that the next token *logit* distribution stays roughly the same so that an `argmax` or `topk` operation gives the same results.
|
||||
|
||||
There are various quantization techniques, which we won't discuss in detail here, but in general, all quantization techniques work as follows:
|
||||
|
||||
- 1. Quantize all weights to the target precision
|
||||
- 2. Load the quantized weights, and pass the input sequence of vectors in bfloat16 precision
|
||||
- 3. Dynamically dequantize weights to bfloat16 to perform the computation with their input vectors in bfloat16 precision
|
||||
|
||||
In a nutshell, this means that *inputs-weight matrix* multiplications, with \\( X \\) being the *inputs*, \\( W \\) being a weight matrix and \\( Y \\) being the output:
|
||||
|
||||
$$ Y = X * W $$
|
||||
|
||||
are changed to
|
||||
|
||||
$$ Y = X * \text{dequantize}(W) $$
|
||||
|
||||
for every matrix multiplication. Dequantization and re-quantization are performed sequentially for all weight matrices as the inputs run through the network graph.
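To make the quantize/dequantize round trip concrete, here is a toy symmetric absmax int8 scheme in plain PyTorch. It only illustrates the idea above and is not the actual algorithm implemented by `bitsandbytes`:

```python
import torch

torch.manual_seed(0)

W = torch.randn(4096, 4096, dtype=torch.bfloat16)  # a weight matrix
X = torch.randn(1, 4096, dtype=torch.bfloat16)     # an input vector in bfloat16

# 1. Quantize: store int8 weights plus one scale per output column
scale = W.abs().amax(dim=0) / 127.0
W_int8 = torch.round(W.float() / scale.float()).to(torch.int8)

# 2./3. At inference time, dequantize back to bfloat16 and multiply with the bfloat16 inputs
W_dequant = (W_int8.float() * scale.float()).to(torch.bfloat16)
Y_quantized = X @ W_dequant
Y_reference = X @ W

print((Y_quantized - Y_reference).abs().max())  # small error relative to the magnitude of Y
```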
|
||||
|
||||
Therefore, inference time is often **not** reduced when using quantized weights, but rather increases.
|
||||
Enough theory, let's give it a try! To quantize the weights with Transformers, you need to make sure that
|
||||
the [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library is installed.
|
||||
|
||||
```bash
|
||||
!pip install bitsandbytes
|
||||
```
|
||||
|
||||
We can then load models in 8-bit quantization by simply adding a `load_in_8bit=True` flag to `from_pretrained`.
|
||||
|
||||
```python
|
||||
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_8bit=True, pad_token_id=0)
|
||||
```
|
||||
|
||||
Now, let's run our example again and measure the memory usage.
|
||||
|
||||
```python
|
||||
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
||||
|
||||
result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):]
|
||||
result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single
|
||||
```
|
||||
|
||||
Nice, we're getting the same result as before, so no loss in accuracy! Let's look at how much memory was used this time.
|
||||
|
||||
```python
|
||||
bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
15.219234466552734
|
||||
```
|
||||
|
||||
Significantly less! We're down to just a bit over 15 GBs and could therefore run this model on consumer GPUs like the 4090.
|
||||
We're seeing a very nice gain in memory efficiency and more or less no degradation to the model's output. However, we can also notice a slight slow-down during inference.
|
||||
|
||||
|
||||
We delete the models and flush the memory again.
|
||||
```python
|
||||
del model
|
||||
del pipe
|
||||
```
|
||||
|
||||
```python
|
||||
flush()
|
||||
```
|
||||
|
||||
Let's see what peak GPU memory consumption 4-bit quantization gives. Quantizing the model to 4-bit can be done with the same API as before - this time by passing `load_in_4bit=True` instead of `load_in_8bit=True`.
|
||||
|
||||
```python
|
||||
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
|
||||
|
||||
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
||||
|
||||
result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):]
|
||||
result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
Here is a Python function that transforms bytes to Giga bytes:\n\n```\ndef bytes_to_gigabytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single argument
|
||||
```
|
||||
|
||||
We're almost seeing the same output text as before - just the `python` is missing just before the code snippet. Let's see how much memory was required.
|
||||
|
||||
```python
|
||||
bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
9.543574333190918
|
||||
```
|
||||
|
||||
Just 9.5GB! That's really not a lot for a >15 billion parameter model.
|
||||
|
||||
While we see very little degradation in accuracy for our model here, 4-bit quantization can in practice often lead to different results compared to 8-bit quantization or full `bfloat16` inference. It is up to the user to try it out.
|
||||
|
||||
Also note that inference here was again a bit slower compared to 8-bit quantization which is due to the more aggressive quantization method used for 4-bit quantization leading to \\( \text{quantize} \\) and \\( \text{dequantize} \\) taking longer during inference.
|
||||
|
||||
```python
|
||||
del model
|
||||
del pipe
|
||||
```
|
||||
```python
|
||||
flush()
|
||||
```
|
||||
|
||||
Overall, we saw that running OctoCoder in 8-bit precision reduced the required GPU VRAM from 32 GB to only 15 GB, and running the model in 4-bit precision further reduces the required GPU VRAM to just a bit over 9 GB.
|
||||
|
||||
4-bit quantization allows the model to be run on GPUs such as RTX3090, V100, and T4 which are quite accessible for most people.
|
||||
|
||||
For more information on quantization and to see how one can quantize models to require even less GPU VRAM memory than 4-bit, we recommend looking into the [`AutoGPTQ`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#autogptq-integration) implementation.
|
||||
|
||||
> As a conclusion, it is important to remember that model quantization trades improved memory efficiency against accuracy and in some cases inference time.
|
||||
|
||||
If GPU memory is not a constraint for your use case, there is often no need to look into quantization. However, many GPUs simply can't run LLMs without quantization methods and in this case, 4-bit and 8-bit quantization schemes are extremely useful tools.
|
||||
|
||||
For more in-detail usage information, we strongly recommend taking a look at the [Transformers Quantization Docs](https://huggingface.co/docs/transformers/main_classes/quantization#general-usage).
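For finer control than the plain `load_in_4bit=True` flag, the same bitsandbytes backend can also be configured through `BitsAndBytesConfig`; a sketch (the compute dtype and quant type shown are common choices, not requirements):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the actual matmuls
    bnb_4bit_quant_type="nf4",              # 4-bit NormalFloat quantization
)

model = AutoModelForCausalLM.from_pretrained(
    "bigcode/octocoder", quantization_config=quant_config, device_map="auto", pad_token_id=0
)
```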
|
||||
Next, let's look into how we can improve computational and memory efficiency by using better algorithms and an improved model architecture.
|
||||
|
||||
# 2. Flash Attention
|
||||
|
||||
Today's top-performing LLMs share more or less the same fundamental architecture that consists of feed-forward layers, activation layers, layer normalization layers, and most crucially, self-attention layers.
|
||||
|
||||
Self-attention layers are central to Large Language Models (LLMs) in that they enable the model to understand the contextual relationships between input tokens.
|
||||
However, the peak GPU memory consumption for self-attention layers grows *quadratically* both in compute and memory complexity with the number of input tokens (also called *sequence length*) that we denote in the following by \\( N \\) .
|
||||
While this is not really noticeable for shorter input sequences (of up to 1000 input tokens), it becomes a serious problem for longer input sequences (at around 16000 input tokens).
|
||||
|
||||
Let's take a closer look. The formula to compute the output \\( \mathbf{O} \\) of a self-attention layer for an input \\( \mathbf{X} \\) of length \\( N \\) is:
|
||||
|
||||
$$ \textbf{O} = \text{Attn}(\mathbf{X}) = \mathbf{V} \times \text{Softmax}(\mathbf{QK}^T) \text{ with } \mathbf{Q} = \mathbf{W}_q \mathbf{X}, \mathbf{V} = \mathbf{W}_v \mathbf{X}, \mathbf{K} = \mathbf{W}_k \mathbf{X} $$
|
||||
|
||||
\\( \mathbf{X} = (\mathbf{x}_1, ... \mathbf{x}_{N}) \\) is thereby the input sequence to the attention layer. The projections \\( \mathbf{Q} \\) and \\( \mathbf{K} \\) will each consist of \\( N \\) vectors resulting in the \\( \mathbf{QK}^T \\) being of size \\( N^2 \\) .
|
||||
|
||||
LLMs usually have multiple attention heads, thus doing multiple self-attention computations in parallel.
|
||||
Assuming the LLM has 40 attention heads and runs in bfloat16 precision, we can calculate the memory requirement to store the \\( \mathbf{QK^T} \\) matrices to be \\( 40 * 2 * N^2 \\) bytes. For \\( N=1000 \\) only around 80 MB of VRAM are needed, however, for \\( N=16000 \\) we would need 19 GB of VRAM, and for \\( N=100,000 \\) we would need almost 1TB just to store the \\( \mathbf{QK}^T \\) matrices.
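A quick back-of-the-envelope helper for this formula (40 heads and 2 bytes per bfloat16 value are the assumptions used above):

```python
def qk_matrix_gb(seq_len: int, num_heads: int = 40, bytes_per_value: int = 2) -> float:
    """Memory (in GB) needed just to materialize the N x N attention score matrices."""
    return num_heads * bytes_per_value * seq_len**2 / 1e9


for n in (1_000, 16_000, 100_000):
    print(f"N={n}: {qk_matrix_gb(n):,.1f} GB")
```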
|
||||
|
||||
Long story short, the default self-attention algorithm quickly becomes prohibitively memory-expensive for large input contexts.
|
||||
|
||||
As LLMs improve in text comprehension and generation, they are applied to increasingly complex tasks. While models once handled the translation or summarization of a few sentences, they now manage entire pages, demanding the capability to process extensive input lengths.
|
||||
|
||||
How can we get rid of the exorbitant memory requirements for large input lengths? We need a new way to compute the self-attention mechanism that gets rid of the \\( QK^T \\) matrix. [Tri Dao et al.](https://arxiv.org/abs/2205.14135) developed exactly such a new algorithm and called it **Flash Attention**.
|
||||
|
||||
In a nutshell, Flash Attention breaks the \\( \mathbf{V} \times \text{Softmax}(\mathbf{QK}^T) \\) computation apart and instead computes smaller chunks of the output by iterating over multiple softmax computation steps:
|
||||
|
||||
$$ \textbf{O}_i \leftarrow s^a_{ij} * \textbf{O}_i + s^b_{ij} * \mathbf{V}_{j} \times \text{Softmax}(\mathbf{QK}^T_{i,j}) \text{ for multiple } i, j \text{ iterations} $$
|
||||
|
||||
with \\( s^a_{ij} \\) and \\( s^b_{ij} \\) being some softmax normalization statistics that need to be recomputed for every \\( i \\) and \\( j \\) .
|
||||
|
||||
Please note that the whole Flash Attention is a bit more complex and is greatly simplified here as going in too much depth is out of scope for this guide. The reader is invited to take a look at the well-written [Flash Attention paper](https://arxiv.org/abs/2205.14135) for more details.
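To give a flavour of what iterating over multiple softmax computation steps means, here is a toy single-query sketch of the online-softmax idea in PyTorch. It is a drastic simplification of the real fused kernel, but it produces the same result as regular softmax attention while only looking at one chunk of \\( \mathbf{K} \\) and \\( \mathbf{V} \\) at a time:

```python
import torch

def chunked_attention(q, K, V, chunk_size=128):
    """Online-softmax attention for a single query vector q, processing K/V chunk by chunk."""
    max_so_far = torch.tensor(float("-inf"))
    normalizer = torch.tensor(0.0)
    output = torch.zeros(V.shape[1])
    for start in range(0, K.shape[0], chunk_size):
        scores = K[start : start + chunk_size] @ q
        new_max = torch.maximum(max_so_far, scores.max())
        rescale = torch.exp(max_so_far - new_max)  # fix up previously accumulated values
        probs = torch.exp(scores - new_max)
        normalizer = rescale * normalizer + probs.sum()
        output = rescale * output + probs @ V[start : start + chunk_size]
        max_so_far = new_max
    return output / normalizer

q, K, V = torch.randn(64), torch.randn(1024, 64), torch.randn(1024, 64)
reference = torch.softmax(K @ q, dim=0) @ V
print(torch.allclose(chunked_attention(q, K, V), reference, atol=1e-5))  # should print True
```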
|
||||
|
||||
The main takeaway here is:
|
||||
|
||||
> By keeping track of softmax normalization statistics and by using some smart mathematics, Flash Attention gives **numerically identical** outputs compared to the default self-attention layer at a memory cost that only increases linearly with \\( N \\) .
|
||||
|
||||
Looking at the formula, one would intuitively say that Flash Attention must be much slower compared to the default self-attention formula as more computation needs to be done. Indeed Flash Attention requires more FLOPs compared to normal attention as the softmax normalization statistics have to constantly be recomputed (see [paper](https://arxiv.org/abs/2205.14135) for more details if interested)
|
||||
|
||||
> However, Flash Attention is much faster in inference compared to default attention which comes from its ability to significantly reduce the demands on the slower, high-bandwidth memory of the GPU (VRAM), focusing instead on the faster on-chip memory (SRAM).
|
||||
|
||||
Essentially, Flash Attention makes sure that all intermediate write and read operations can be done using the fast *on-chip* SRAM memory instead of having to access the slower VRAM memory to compute the output vector \\( \mathbf{O} \\) .
|
||||
|
||||
In practice, there is currently absolutely no reason to **not** use Flash Attention if available. The algorithm gives mathematically the same outputs, and is both faster and more memory-efficient.
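How to switch it on depends on your setup; two common routes, assuming a supported model and the required packages (`flash-attn` for the first, `optimum` for the second), are sketched below:

```python
import torch
from transformers import AutoModelForCausalLM

# Option 1: native Flash Attention 2 integration (needs the flash-attn package and a supported model)
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b", torch_dtype=torch.bfloat16, use_flash_attention_2=True, device_map="auto"
)

# Option 2: BetterTransformer, which routes attention through
# torch.nn.functional.scaled_dot_product_attention and its Flash Attention kernels
model = AutoModelForCausalLM.from_pretrained(
    "bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto"
).to_bettertransformer()
```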
|
||||
|
||||
Let's look at a practical example.
|
||||
|
||||
Our OctoCoder model now gets a significantly longer input prompt which includes a so-called *system prompt*. System prompts are used to steer the LLM into a better assistant that is tailored to the users' task.
|
||||
In the following, we use a system prompt that will make OctoCoder a better coding assistant.
|
||||
|
||||
```python
|
||||
system_prompt = """Below are a series of dialogues between various people and an AI technical assistant.
|
||||
The assistant tries to be helpful, polite, honest, sophisticated, emotionally aware, and humble but knowledgeable.
|
||||
The assistant is happy to help with code questions and will do their best to understand exactly what is needed.
|
||||
It also tries to avoid giving false or misleading information, and it caveats when it isn't entirely sure about the right answer.
|
||||
That said, the assistant is practical really does its best, and doesn't let caution get too much in the way of being useful.
|
||||
|
||||
The Starcoder models are a series of 15.5B parameter models trained on 80+ programming languages from The Stack (v1.2) (excluding opt-out requests).
|
||||
The model uses Multi Query Attention, was trained using the Fill-in-the-Middle objective, and with 8,192 tokens context window for a trillion tokens of heavily deduplicated data.
|
||||
|
||||
-----
|
||||
|
||||
Question: Write a function that takes two lists and returns a list that has alternating elements from each input list.
|
||||
|
||||
Answer: Sure. Here is a function that does that.
|
||||
|
||||
def alternating(list1, list2):
|
||||
results = []
|
||||
for i in range(len(list1)):
|
||||
results.append(list1[i])
|
||||
results.append(list2[i])
|
||||
return results
|
||||
|
||||
Question: Can you write some test cases for this function?
|
||||
|
||||
Answer: Sure, here are some tests.
|
||||
|
||||
assert alternating([10, 20, 30], [1, 2, 3]) == [10, 1, 20, 2, 30, 3]
|
||||
assert alternating([True, False], [4, 5]) == [True, 4, False, 5]
|
||||
assert alternating([], []) == []
|
||||
|
||||
Question: Modify the function so that it returns all input elements when the lists have uneven length. The elements from the longer list should be at the end.
|
||||
|
||||
Answer: Here is the modified function.
|
||||
|
||||
def alternating(list1, list2):
|
||||
results = []
|
||||
for i in range(min(len(list1), len(list2))):
|
||||
results.append(list1[i])
|
||||
results.append(list2[i])
|
||||
if len(list1) > len(list2):
|
||||
results.extend(list1[i+1:])
|
||||
else:
|
||||
results.extend(list2[i+1:])
|
||||
return results
|
||||
|
||||
-----
|
||||
"""
|
||||
```
|
||||
For demonstration purposes, we repeat the system prompt ten times so that the input length is long enough to observe Flash Attention's memory savings.
|
||||
We then append the original text prompt `"Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here"`.
|
||||
|
||||
```python
|
||||
long_prompt = 10 * system_prompt + prompt
|
||||
```
|
||||
|
||||
We instantiate our model again in bfloat16 precision.
|
||||
|
||||
```python
|
||||
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
|
||||
|
||||
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
||||
```
|
||||
|
||||
Let's now run the model just like before *without Flash Attention* and measure the peak GPU memory requirement and inference time.
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
start_time = time.time()
|
||||
result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):]
|
||||
|
||||
print(f"Generated in {time.time() - start_time} seconds.")
|
||||
result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
Generated in 10.96854019165039 seconds.
|
||||
Sure. Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. Here is a function that does that.\n\ndef
|
||||
```
|
||||
|
||||
We're getting the same output as before, however this time, the model repeats the answer multiple times until it reaches the 60-token cut-off. This is not surprising as we've repeated the system prompt ten times for demonstration purposes and thus cued the model to repeat itself.
|
||||
|
||||
**Note** that the system prompt should not be repeated ten times in real-world applications - one time is enough!
|
||||
|
||||
Let's measure the peak GPU memory requirement.
|
||||
|
||||
```python
|
||||
bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
37.668193340301514
|
||||
```
|
||||
|
||||
As we can see, the peak GPU memory requirement is now significantly higher than in the beginning, which is largely due to the longer input sequence. Also, the generation now takes around 11 seconds.
|
||||
|
||||
We call `flush()` to free GPU memory for our next experiment.
|
||||
|
||||
```python
|
||||
flush()
|
||||
```
|
||||
|
||||
For comparison, let's run the same function, but enable Flash Attention instead.
|
||||
To do so, we convert the model to [BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview), thereby enabling PyTorch's [SDPA self-attention](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention), which in turn is based on Flash Attention.
|
||||
|
||||
```python
|
||||
model.to_bettertransformer()
|
||||
```
|
||||
|
||||
Now we run the exact same code snippet as before and under the hood Transformers will make use of Flash Attention.
|
||||
|
||||
```py
|
||||
start_time = time.time()
|
||||
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
|
||||
result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):]
|
||||
|
||||
print(f"Generated in {time.time() - start_time} seconds.")
|
||||
result
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
Generated in 3.0211617946624756 seconds.
|
||||
Sure. Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. Here is a function that does that.\n\ndef
|
||||
```
|
||||
|
||||
We're getting the exact same result as before, but can observe a very significant speed-up thanks to Flash Attention.
|
||||
|
||||
Let's measure the memory consumption one last time.
|
||||
|
||||
```python
|
||||
bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
32.617331981658936
|
||||
```
|
||||
|
||||
And we're almost back to our original 29GB peak GPU memory from the beginning.
|
||||
|
||||
We can observe that we only use roughly 100MB more GPU memory when passing a very long input sequence with Flash Attention compared to passing a short input sequence as done in the beginning.
|
||||
|
||||
```py
|
||||
flush()
|
||||
```
|
||||
For more information on how to use Flash Attention, please have a look at [this doc page](https://huggingface.co/docs/transformers/v4.34.0/en/perf_infer_gpu_one#flash-attention-2).
|
||||
## 3. Architectural Innovations
|
||||
|
||||
So far we have looked into improving computational and memory efficiency by:
|
||||
|
||||
- Casting the weights to a lower precision format
|
||||
- Replacing the self-attention algorithm with a more memory- and compute-efficient version
|
||||
|
||||
Let's now look into how we can change the architecture of an LLM so that it is most effective and efficient for tasks that require long text inputs, *e.g.*:
|
||||
- Retrieval-augmented question answering,
|
||||
- Summarization,
|
||||
- Chat
|
||||
|
||||
Note that *chat* not only requires the LLM to handle long text inputs, but it also necessitates that the LLM is able to efficiently handle the back-and-forth dialogue between user and assistant (such as ChatGPT).
|
||||
|
||||
Once trained, the fundamental LLM architecture is difficult to change, so it is important to consider the LLM's intended tasks beforehand and optimize the model's architecture accordingly.
|
||||
There are two important components of the model architecture that quickly become memory and/or performance bottlenecks for large input sequences.
|
||||
|
||||
- The positional embeddings
|
||||
- The key-value cache
|
||||
|
||||
Let's go over each component in more detail.
|
||||
|
||||
### 3.1 Improving positional embeddings of LLMs
|
||||
|
||||
Self-attention puts each token in relation to all other tokens.
|
||||
As an example, the \\( \text{Softmax}(\mathbf{QK}^T) \\) matrix of the text input sequence *"Hello", "I", "love", "you"* could look as follows:
|
||||
|
||||

|
||||
|
||||
Each word token is assigned a probability mass with which it attends to all other word tokens and is therefore put into relation with all of them. E.g. the word *"love"* attends to the word *"Hello"* with 5%, to *"I"* with 30%, and to itself with 65%.
|
||||
|
||||
An LLM based on self-attention, but without position embeddings, would have great difficulties in understanding the positions of the text inputs relative to each other.
|
||||
This is because the probability score computed by \\( \mathbf{QK}^T \\) relates each word token to each other word token in \\( O(1) \\) computations regardless of their relative positional distance to each other.
|
||||
Therefore, for the LLM without position embeddings each token appears to have the same distance to all other tokens, *e.g.* differentiating between *"Hello I love you"* and *"You love I hello"* would be very challenging.
|
||||
|
||||
For the LLM to understand sentence order, an additional *cue* is needed and is usually applied in the form of *positional encodings* (also called *positional embeddings*).
|
||||
Positional encodings encode the position of each token into a numerical representation that the LLM can leverage to better understand sentence order.
|
||||
|
||||
The authors of the [*Attention Is All You Need*](https://arxiv.org/abs/1706.03762) paper introduced sinusoidal positional embeddings \\( \mathbf{P} = \mathbf{p}_1, \ldots, \mathbf{p}_N \\) .
|
||||
where each vector \\( \mathbf{p}_i \\) is computed as a sinusoidal function of its position \\( i \\) .
|
||||
The positional encodings are then simply added to the input sequence vectors \\( \mathbf{\hat{X}} = \mathbf{\hat{x}}_1, \ldots, \mathbf{\hat{x}}_N \\) = \\( \mathbf{x}_1 + \mathbf{p}_1, \ldots, \mathbf{x}_N + \mathbf{p}_N \\) thereby cueing the model to better learn sentence order.
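As a rough sketch of what such sinusoidal encodings look like (the helper name and shapes below are assumptions made for illustration, not the exact formulation used by any specific model):

```python
import math

import torch

def sinusoidal_positions(num_positions: int, dim: int) -> torch.Tensor:
    # p_i[2k] = sin(i / 10000^(2k/dim)), p_i[2k+1] = cos(i / 10000^(2k/dim))
    position = torch.arange(num_positions).unsqueeze(1)                         # (N, 1)
    div_term = torch.exp(torch.arange(0, dim, 2) * (-math.log(10000.0) / dim))  # (dim / 2,)
    pe = torch.zeros(num_positions, dim)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

# The encodings are simply added to the token embeddings:
# hidden_states = token_embeddings + sinusoidal_positions(seq_len, hidden_dim)
```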
|
||||
|
||||
Instead of using fixed position embeddings, others (such as [Devlin et al.](https://arxiv.org/abs/1810.04805)) used learned positional encodings for which the positional embeddings
|
||||
\\( \mathbf{P} \\) are learned during training.
|
||||
|
||||
Sinusoidal and learned position embeddings used to be the predominant methods to encode sentence order into LLMs, but a couple of problems related to these positional encodings were found:
|
||||
|
||||
1. Sinusoidal and learned position embeddings are both absolute positional embeddings, *i.e.* encoding a unique embedding for each position id: \\( 0, \ldots, N \\) . As shown by [Huang et al.](https://arxiv.org/abs/2009.13658) and [Su et al.](https://arxiv.org/abs/2104.09864), absolute positional embeddings lead to poor LLM performance for long text inputs. For long text inputs, it is advantageous if the model learns the relative positional distance input tokens have to each other instead of their absolute position.
|
||||
2. When using learned position embeddings, the LLM has to be trained on a fixed input length \\( N \\), which makes it difficult to extrapolate to an input length longer than what it was trained on.
|
||||
|
||||
Recently, relative positional embeddings that can tackle the above mentioned problems have become more popular, most notably:
|
||||
|
||||
- [Rotary Position Embedding (RoPE)](https://arxiv.org/abs/2104.09864)
|
||||
- [ALiBi](https://arxiv.org/abs/2108.12409)
|
||||
|
||||
Both *RoPE* and *ALiBi* argue that it's best to cue the LLM about sentence order directly in the self-attention algorithm as it's there that word tokens are put into relation with each other. More specifically, sentence order should be cued by modifying the \\( \mathbf{QK}^T \\) computation.
|
||||
|
||||
Without going into too many details, *RoPE* notes that positional information can be encoded into query-key pairs, *e.g.* \\( \mathbf{q}_i \\) and \\( \mathbf{x}_j \\), by rotating each vector by an angle \\( \theta * i \\) and \\( \theta * j \\) respectively, with \\( i, j \\) describing each vector's sentence position:
|
||||
|
||||
$$ \mathbf{\hat{q}}_i^T \mathbf{\hat{x}}_j = \mathbf{{q}}_i^T \mathbf{R}_{\theta, i -j} \mathbf{{x}}_j. $$
|
||||
|
||||
\\( \mathbf{R}_{\theta, i - j} \\) thereby represents a rotational matrix. \\( \theta \\) is *not* learned during training, but instead set to a pre-defined value that depends on the maximum input sequence length during training.
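As a toy 2-D illustration of this idea (real RoPE implementations rotate every pair of channels with a different base frequency; `theta` and the positions below are made-up values):

```python
import math

import torch

def rotate_2d(vec: torch.Tensor, angle: float) -> torch.Tensor:
    rot = torch.tensor([[math.cos(angle), -math.sin(angle)],
                        [math.sin(angle),  math.cos(angle)]])
    return rot @ vec

theta = 0.1
q_i = torch.tensor([1.0, 0.0])  # query vector at position i = 3
k_j = torch.tensor([0.5, 0.5])  # key vector at position j = 1

score = rotate_2d(q_i, theta * 3) @ rotate_2d(k_j, theta * 1)
# Shifting both positions by the same offset (e.g. i = 13, j = 11) yields the same score
# because only the relative distance i - j enters the dot product.
```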
|
||||
|
||||
> By doing so, the probability score between \\( \mathbf{q}_i \\) and \\( \mathbf{x}_j \\) is only affected if \\( i \ne j \\) and solely depends on the relative distance \\( i - j \\) regardless of each vector's specific positions \\( i \\) and \\( j \\) .
|
||||
|
||||
*RoPE* is used in several of today's most important LLMs, such as:
|
||||
|
||||
- [**Falcon**](https://huggingface.co/tiiuae/falcon-40b)
|
||||
- [**Llama**](https://arxiv.org/abs/2302.13971)
|
||||
- [**PaLM**](https://arxiv.org/abs/2204.02311)
|
||||
|
||||
As an alternative, *ALiBi* proposes a much simpler relative position encoding scheme. The relative distance that input tokens have to each other is added as a negative integer scaled by a pre-defined value `m` to each query-key entry of the \\( \mathbf{QK}^T \\) matrix right before the softmax computation.
|
||||
|
||||

|
||||
|
||||
As shown in the [ALiBi](https://arxiv.org/abs/2108.12409) paper, this simple relative positional encoding allows the model to retain a high performance even at very long text input sequences.
|
||||
|
||||
*ALiBi* is used in several of today's most important LLMs, such as:
|
||||
|
||||
- [**MPT**](https://huggingface.co/mosaicml/mpt-30b)
|
||||
- [**BLOOM**](https://huggingface.co/bigscience/bloom)
|
||||
|
||||
Both *RoPE* and *ALiBi* position encodings can extrapolate to input lengths not seen during training, although extrapolation has been shown to work much better out-of-the-box for *ALiBi* than for *RoPE*.
|
||||
For ALiBi, one simply increases the values of the lower triangular position matrix to match the length of the input sequence.
|
||||
For *RoPE*, keeping the same \\( \theta \\) that was used during training leads to poor results when passing text inputs much longer than those seen during training, *c.f* [Press et al.](https://arxiv.org/abs/2108.12409). However, the community has found a couple of effective tricks that adapt \\( \theta \\), thereby allowing *RoPE* position embeddings to work well for extrapolated text input sequences (see [here](https://github.com/huggingface/transformers/pull/24653)).
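For example, many RoPE-based checkpoints in Transformers expose a `rope_scaling` configuration option for exactly this purpose. The snippet below is a sketch rather than a universal recipe - whether and how a given model supports it varies by architecture and version:

```python
from transformers import AutoModelForCausalLM

# Hypothetical example: scale the RoPE frequencies so that a model trained on a
# shorter context can be prompted with inputs roughly 4x longer.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    rope_scaling={"type": "linear", "factor": 4.0},
)
```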
|
||||
|
||||
> Both RoPE and ALiBi are relative positional embeddings that are *not* learned during training, but instead are based on the following intuitions:
|
||||
- Positional cues about the text inputs should be given directly to the \\( QK^T \\) matrix of the self-attention layer
|
||||
- The LLM should be incentivized to learn a constant *relative* positional distance between tokens rather than their absolute positions
|
||||
- The further text input tokens are from each other, the lower their query-key probability should be. Both RoPE and ALiBi lower the query-key probability of tokens far away from each other: RoPE by increasing the angle between the query and key vectors, thereby decreasing their vector product; ALiBi by adding large negative numbers to the vector product
|
||||
|
||||
In conclusion, LLMs that are intended to be deployed in tasks that require handling large text inputs are better trained with relative positional embeddings, such as RoPE and ALiBi. Also note that even if an LLM with RoPE and ALiBi has been trained only on a fixed length of say \\( N_1 = 2048 \\) it can still be used in practice with text inputs much larger than \\( N_1 \\), like \\( N_2 = 8192 > N_1 \\) by extrapolating the positional embeddings.
|
||||
|
||||
### 3.2 The key-value cache
|
||||
|
||||
Auto-regressive text generation with LLMs works by iteratively putting in an input sequence, sampling the next token, appending the next token to the input sequence, and continuing to do so until the LLM produces a token that signifies that the generation has finished.
|
||||
|
||||
Please have a look at [Transformer's Generate Text Tutorial](https://huggingface.co/docs/transformers/llm_tutorial#generate-text) to get a more visual explanation of how auto-regressive generation works.
|
||||
|
||||
Let's run a quick code snippet to show how auto-regressive generation works in practice. We will simply take the most likely next token via `torch.argmax`.
|
||||
|
||||
```python
|
||||
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
|
||||
|
||||
for _ in range(5):
|
||||
next_logits = model(input_ids)["logits"][:, -1:]
|
||||
next_token_id = torch.argmax(next_logits,dim=-1)
|
||||
|
||||
input_ids = torch.cat([input_ids, next_token_id], dim=-1)
|
||||
print("shape of input_ids", input_ids.shape)
|
||||
|
||||
generated_text = tokenizer.batch_decode(input_ids[:, -5:])
|
||||
generated_text
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
shape of input_ids torch.Size([1, 21])
|
||||
shape of input_ids torch.Size([1, 22])
|
||||
shape of input_ids torch.Size([1, 23])
|
||||
shape of input_ids torch.Size([1, 24])
|
||||
shape of input_ids torch.Size([1, 25])
|
||||
[' Here is a Python function']
|
||||
```
|
||||
|
||||
As we can see, at every iteration we extend the text input tokens by the just-sampled token.
|
||||
|
||||
With very few exceptions, LLMs are trained using the [causal language modeling objective](https://huggingface.co/docs/transformers/tasks/language_modeling#causal-language-modeling) and therefore mask the upper triangle of the attention score matrix - this is why in the two diagrams above the attention scores are left blank (*a.k.a* have 0 probability). For a quick recap on causal language modeling you can refer to the [*Illustrated Self Attention blog*](https://jalammar.github.io/illustrated-gpt2/#part-2-illustrated-self-attention).
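In code, this causal mask usually amounts to filling the upper triangle of the score matrix with `-inf` before the softmax; a minimal sketch:

```python
import torch

seq_len = 5
scores = torch.randn(seq_len, seq_len)  # stand-in for QK^T / sqrt(d)
causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
scores = scores.masked_fill(causal_mask, float("-inf"))  # tokens cannot attend to later tokens
probs = torch.softmax(scores, dim=-1)                    # the upper triangle gets 0 probability
```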
|
||||
|
||||
As a consequence, tokens *never* depend on later tokens, more specifically the \\( \mathbf{q}_i \\) vector is never put in relation with any key, value vectors \\( \mathbf{k}_j, \mathbf{v}_j \\) if \\( j > i \\) . Instead \\( \mathbf{q}_i \\) only attends to previous key-value vectors \\( \mathbf{k}_{m < i}, \mathbf{v}_{m < i} \text{ , for } m \in \{0, \ldots i - 1\} \\). In order to reduce unnecessary computation, one can therefore cache each layer's key-value vectors for all previous timesteps.
|
||||
|
||||
In the following, we will tell the LLM to make use of the key-value cache by retrieving and forwarding it for each forward pass.
|
||||
In Transformers, we can retrieve the key-value cache by passing the `use_cache` flag to the `forward` call and can then pass it with the current token.
|
||||
|
||||
```python
|
||||
past_key_values = None # past_key_values is the key-value cache
|
||||
generated_tokens = []
|
||||
next_token_id = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
|
||||
|
||||
for _ in range(5):
|
||||
next_logits, past_key_values = model(next_token_id, past_key_values=past_key_values, use_cache=True).to_tuple()
|
||||
next_logits = next_logits[:, -1:]
|
||||
next_token_id = torch.argmax(next_logits, dim=-1)
|
||||
|
||||
print("shape of input_ids", next_token_id.shape)
|
||||
print("length of key-value cache", len(past_key_values[0][0])) # past_key_values are of shape [num_layers, 0 for k, 1 for v, batch_size, length, hidden_dim]
|
||||
generated_tokens.append(next_token_id.item())
|
||||
|
||||
generated_text = tokenizer.batch_decode(generated_tokens)
|
||||
generated_text
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
shape of input_ids torch.Size([1, 1])
|
||||
length of key-value cache 20
|
||||
shape of input_ids torch.Size([1, 1])
|
||||
length of key-value cache 21
|
||||
shape of input_ids torch.Size([1, 1])
|
||||
length of key-value cache 22
|
||||
shape of input_ids torch.Size([1, 1])
|
||||
length of key-value cache 23
|
||||
shape of input_ids torch.Size([1, 1])
|
||||
length of key-value cache 24
|
||||
[' Here', ' is', ' a', ' Python', ' function']
|
||||
```
|
||||
|
||||
As one can see, when using the key-value cache the text input tokens are *not* increased in length, but remain a single input vector. The length of the key-value cache on the other hand is increased by one at every decoding step.
|
||||
|
||||
> Making use of the key-value cache means that the \\( \mathbf{QK}^T \\) is essentially reduced to \\( \mathbf{q}_c\mathbf{K}^T \\) with \\( \mathbf{q}_c \\) being the query projection of the currently passed input token which is *always* just a single vector.
|
||||
|
||||
Using the key-value cache has two advantages:
|
||||
- Significant increase in computational efficiency as fewer computations are performed compared to computing the full \\( \mathbf{QK}^T \\) matrix. This leads to an increase in inference speed
|
||||
- The maximum required memory is not increased quadratically with the number of generated tokens, but only increases linearly.
|
||||
|
||||
> One should *always* make use of the key-value cache as it leads to identical results and a significant speed-up for longer input sequences. Transformers has the key-value cache enabled by default when making use of the text pipeline or the [`generate` method](https://huggingface.co/docs/transformers/main_classes/text_generation).
|
||||
|
||||
Note that the key-value cache is especially useful for applications such as chat where multiple passes of auto-regressive decoding are required. Let's look at an example.
|
||||
|
||||
```
|
||||
User: How many people live in France?
|
||||
Assistant: Roughly 75 million people live in France
|
||||
User: And how many are in Germany?
|
||||
Assistant: Germany has ca. 81 million inhabitants
|
||||
```
|
||||
|
||||
In this chat, the LLM runs auto-regressive decoding twice:
|
||||
- 1. The first time, the key-value cache is empty and the input prompt is `"User: How many people live in France?"` and the model auto-regressively generates the text `"Roughly 75 million people live in France"` while increasing the key-value cache at every decoding step.
|
||||
- 2. The second time the input prompt is `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many are in Germany?"`. Thanks to the cache, all key-value vectors for the first two sentences are already computed. Therefore the input prompt only consists of `"User: And how many are in Germany?"`. While processing the shortened input prompt, its computed key-value vectors are concatenated to the key-value cache of the first decoding. The second Assistant's answer `"Germany has ca. 81 million inhabitants"` is then auto-regressively generated with the key-value cache consisting of encoded key-value vectors of `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many are in Germany?"`.
|
||||
|
||||
Two things should be noted here:
|
||||
1. Keeping all the context is crucial for LLMs deployed in chat so that the LLM understands all the previous context of the conversation. E.g. for the example above the LLM needs to understand that the user refers to the population when asking `"And how many are in Germany"`.
|
||||
2. The key-value cache is extremely useful for chat as it allows us to continuously grow the encoded chat history instead of having to re-encode the chat history again from scratch (as e.g. would be the case when using an encoder-decoder architecture).
|
||||
|
||||
There is however one catch. While the required peak memory for the \\( \mathbf{QK}^T \\) matrix is significantly reduced, holding the key-value cache in memory can become very memory expensive for long input sequences or multi-turn chat. Remember that the key-value cache needs to store the key-value vectors for all previous input vectors \\( \mathbf{x}_i \text{, for } i \in \{1, \ldots, c - 1\} \\) for all self-attention layers and for all attention heads.
|
||||
|
||||
Let's compute the number of float values that need to be stored in the key-value cache for the LLM `bigcode/octocoder` that we used before.
|
||||
The number of float values amounts to two times the sequence length times the number of attention heads times the attention head dimension times the number of layers.
|
||||
Computing this for our LLM at a hypothetical input sequence length of 16000 gives:
|
||||
|
||||
```python
|
||||
config = model.config
|
||||
2 * 16_000 * config.n_layer * config.n_head * config.n_embd // config.n_head
|
||||
```
|
||||
|
||||
**Output**:
|
||||
```
|
||||
7864320000
|
||||
```
|
||||
|
||||
Roughly 8 billion float values! Storing 8 billion float values in `float16` precision requires around 15 GB of RAM which is circa half as much as the model weights themselves!
|
||||
Researchers have proposed two methods that significantly reduce the memory cost of storing the key-value cache:
|
||||
|
||||
1. [Multi-Query-Attention (MQA)](https://arxiv.org/abs/1911.02150)
|
||||
|
||||
Multi-Query-Attention was proposed in Noam Shazeer's *Fast Transformer Decoding: One Write-Head is All You Need* paper. As the title says, Noam found out that instead of using `n_head` key-value projection weights, one can use a single key-value projection weight pair that is shared across all attention heads without the model's performance degrading significantly.
|
||||
|
||||
> By using a single key-value projection weight pair, the key-value vectors \\( \mathbf{k}_i, \mathbf{v}_i \\) have to be identical across all attention heads, which in turn means that we only need to store 1 key-value projection pair in the cache instead of `n_head` ones.
|
||||
|
||||
As most LLMs use between 20 and 100 attention heads, MQA significantly reduces the memory consumption of the key-value cache. For the LLM used in this notebook we could therefore reduce the required memory consumption from 15 GB to less than 400 MB at an input sequence length of 16000.
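A quick back-of-the-envelope check of these numbers (the head count below is an assumption matching `bigcode/octocoder`'s configuration; figures are approximate):

```python
n_layer, n_head, n_embd, seq_len = 40, 48, 6144, 16_000
head_dim = n_embd // n_head

full_cache_floats = 2 * seq_len * n_layer * n_embd   # keys and values for all n_head heads
mqa_cache_floats = 2 * seq_len * n_layer * head_dim  # a single key-value head shared by all heads

print(full_cache_floats * 2 / 1e9)  # ~15.7 GB in float16
print(mqa_cache_floats * 2 / 1e6)   # ~328 MB in float16
```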
|
||||
|
||||
In addition to memory savings, MQA also leads to improved computational efficiency as explained in the following.
|
||||
In auto-regressive decoding, large key-value vectors need to be reloaded, concatenated with the current key-value vector pair to be then fed into the \\( \mathbf{q}_c\mathbf{K}^T \\) computation at every step. For auto-regressive decoding, the required memory bandwidth for the constant reloading can become a serious time bottleneck. By reducing the size of the key-value vectors less memory needs to be accessed, thus reducing the memory bandwidth bottleneck. For more detail, please have a look at [Noam's paper](https://arxiv.org/abs/1911.02150).
|
||||
|
||||
The important part to understand here is that reducing the number of key-value attention heads to 1 only makes sense if a key-value cache is used. The peak memory consumption of the model for a single forward pass without key-value cache stays unchanged as every attention head still has a unique query vector so that each attention head still has a different \\( \mathbf{QK}^T \\) matrix.
|
||||
|
||||
MQA has seen wide adoption by the community and is now used by many of the most popular LLMs:
|
||||
|
||||
- [**Falcon**](https://huggingface.co/tiiuae/falcon-40b)
|
||||
- [**PaLM**](https://arxiv.org/abs/2204.02311)
|
||||
- [**MPT**](https://huggingface.co/mosaicml/mpt-30b)
|
||||
- [**BLOOM**](https://huggingface.co/bigscience/bloom)
|
||||
|
||||
Also, the checkpoint used in this notebook - `bigcode/octocoder` - makes use of MQA.
|
||||
|
||||
2. [Grouped-Query-Attention (GQA)](https://arxiv.org/abs/2305.13245)
|
||||
|
||||
Grouped-Query-Attention, as proposed by Ainslie et al. from Google, starts from the observation that using MQA can often lead to quality degradation compared to using vanilla multi-key-value head projections. The paper argues that more model performance can be kept by less drastically reducing the number of key-value head projection weights. Instead of using just a single key-value projection weight, `n < n_head` key-value projection weights should be used. By setting `n` to a value significantly smaller than `n_head`, such as 2, 4, or 8, almost all of the memory and speed gains from MQA can be kept while sacrificing less model capacity and thus arguably less performance.
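The following sketch shows the grouping idea with made-up head counts (not octocoder's configuration):

```python
import torch

n_query_heads, n_kv_heads, head_dim, seq_len = 8, 2, 64, 16
group_size = n_query_heads // n_kv_heads  # 4 query heads share one key-value head

q = torch.randn(n_query_heads, seq_len, head_dim)
k = torch.randn(n_kv_heads, seq_len, head_dim)  # only n_kv_heads key-value pairs are cached
v = torch.randn(n_kv_heads, seq_len, head_dim)

# Expand the key-value heads so that each group of query heads reads the same one
k = k.repeat_interleave(group_size, dim=0)  # (n_query_heads, seq_len, head_dim)
v = v.repeat_interleave(group_size, dim=0)

attn = torch.softmax(q @ k.transpose(-1, -2) / head_dim**0.5, dim=-1)
out = attn @ v  # (n_query_heads, seq_len, head_dim)
```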
|
||||
|
||||
Moreover, the authors of GQA found out that existing model checkpoints can be *uptrained* to have a GQA architecture with as little as 5% of the original pre-training compute. While 5% of the original pre-training compute can still be a massive amount, GQA *uptraining* allows existing checkpoints to be useful for longer input sequences.
|
||||
|
||||
GQA was only recently proposed which is why there is less adoption at the time of writing this notebook.
|
||||
The most notable application of GQA is [Llama-v2](https://huggingface.co/meta-llama/Llama-2-70b-hf).
|
||||
|
||||
> As a conclusion, it is strongly recommended to make use of either GQA or MQA if the LLM is deployed with auto-regressive decoding and is required to handle large input sequences as is the case for example for chat.
|
||||
|
||||
## Conclusion
|
||||
|
||||
The research community is constantly coming up with new, nifty ways to speed up inference time for ever-larger LLMs. As an example, one such promising research direction is [speculative decoding](https://arxiv.org/abs/2211.17192) where "easy tokens" are generated by smaller, faster language models and only "hard tokens" are generated by the LLM itself. Going into more detail is out of the scope of this notebook, but you can read more about it in this [nice blog post](https://huggingface.co/blog/assisted-generation).
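Transformers already exposes this idea through *assisted generation*. The snippet below is a hedged sketch: the assistant checkpoint is just an example of a smaller model that shares OctoCoder's tokenizer, and support may vary across versions.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto")
assistant = AutoModelForCausalLM.from_pretrained("bigcode/tiny_starcoder_py", torch_dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer("def bytes_to_giga_bytes(bytes):", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```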
|
||||
|
||||
The reason massive LLMs such as GPT3/4, Llama-2-70b, Claude, PaLM can run so quickly in chat-interfaces such as [Hugging Face Chat](https://huggingface.co/chat/) or ChatGPT is in large part thanks to the above-mentioned improvements in precision, algorithms, and architecture.
|
||||
Going forward, accelerators such as GPUs, TPUs, etc. will only get faster and allow for more memory, but one should nevertheless always make sure to use the best available algorithms and architectures to get the most bang for your buck 🤗
|
||||
@ -25,7 +25,7 @@ Callbacks are "read only" pieces of code, apart from the [`TrainerControl`] obje
|
||||
cannot change anything in the training loop. For customizations that require changes in the training loop, you should
|
||||
subclass [`Trainer`] and override the methods you need (see [trainer](trainer) for examples).
|
||||
|
||||
By default a [`Trainer`] will use the following callbacks:
|
||||
By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] will use the following callbacks.
|
||||
|
||||
- [`DefaultFlowCallback`] which handles the default behavior for logging, saving and evaluation.
|
||||
- [`PrinterCallback`] or [`ProgressCallback`] to display progress and print the
|
||||
@ -45,6 +45,8 @@ By default a [`Trainer`] will use the following callbacks:
|
||||
- [`~integrations.DagsHubCallback`] if [dagshub](https://dagshub.com/) is installed.
|
||||
- [`~integrations.FlyteCallback`] if [flyte](https://flyte.org/) is installed.
|
||||
|
||||
If a package is installed but you don't wish to use the accompanying integration, you can change `TrainingArguments.report_to` to a list of just those integrations you want to use (e.g. `["azure_ml", "wandb"]`).
|
||||
|
||||
The main class that implements callbacks is [`TrainerCallback`]. It gets the
|
||||
[`TrainingArguments`] used to instantiate the [`Trainer`], can access that
|
||||
Trainer's internal state via [`TrainerState`], and can take some actions on the training loop via
|
||||
|
||||
@ -1224,6 +1224,7 @@ As long as you don't enable `offload_optimizer` you can mix and match DeepSpeed
|
||||
optimizers, with the exception of using the combination of HuggingFace scheduler and DeepSpeed optimizer:
|
||||
|
||||
| Combos | HF Scheduler | DS Scheduler |
|
||||
|:-------------|:-------------|:-------------|
|
||||
| HF Optimizer | Yes | Yes |
|
||||
| DS Optimizer | No | Yes |
|
||||
|
||||
|
||||
@ -16,10 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Feature Extractor
|
||||
|
||||
A feature extractor is in charge of preparing input features for audio or vision models. This includes feature extraction
|
||||
from sequences, *e.g.*, pre-processing audio files to Log-Mel Spectrogram features, feature extraction from images
|
||||
*e.g.* cropping image image files, but also padding, normalization, and conversion to Numpy, PyTorch, and TensorFlow
|
||||
tensors.
|
||||
A feature extractor is in charge of preparing input features for audio or vision models. This includes feature extraction from sequences, e.g., pre-processing audio files to generate Log-Mel Spectrogram features, feature extraction from images, e.g., cropping image files, but also padding, normalization, and conversion to NumPy, PyTorch, and TensorFlow tensors.
|
||||
|
||||
|
||||
## FeatureExtractionMixin
|
||||
|
||||
@ -71,6 +71,23 @@ verbose to the most verbose), those levels (with their corresponding int values
|
||||
|
||||
By default, `tqdm` progress bars will be displayed during model download. [`logging.disable_progress_bar`] and [`logging.enable_progress_bar`] can be used to suppress or unsuppress this behavior.
|
||||
|
||||
## `logging` vs `warnings`
|
||||
|
||||
Python has two logging systems that are often used in conjunction: `logging`, which is explained above, and `warnings`,
|
||||
which allows further classification of warnings in specific buckets, e.g., `FutureWarning` for a feature or path
|
||||
that has already been deprecated and `DeprecationWarning` to indicate an upcoming deprecation.
|
||||
|
||||
We use both in the `transformers` library. We leverage and adapt `logging`'s `captureWarning` method to allow
|
||||
management of these warning messages by the verbosity setters above.
|
||||
|
||||
What does that mean for developers of the library? We should respect the following heuristic:
|
||||
- `warnings` should be favored for developers of the library and libraries dependent on `transformers`
|
||||
- `logging` should be used for end-users of the library using it in every-day projects
|
||||
|
||||
See reference of the `captureWarnings` method below.
|
||||
|
||||
[[autodoc]] logging.captureWarnings
|
||||
|
||||
## Base setters
|
||||
|
||||
[[autodoc]] logging.set_verbosity_error
|
||||
|
||||
@ -44,6 +44,7 @@ an optional `attentions` attribute. Here we have the `loss` since we passed alon
|
||||
|
||||
When passing `output_hidden_states=True` you may expect the `outputs.hidden_states[-1]` to match `outputs.last_hidden_states` exactly.
|
||||
However, this is not always the case. Some models apply normalization or subsequent process to the last hidden state when it's returned.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
|
||||
@ -352,6 +352,12 @@ Pipelines available for computer vision tasks include the following.
|
||||
- __call__
|
||||
- all
|
||||
|
||||
### ImageToImagePipeline
|
||||
|
||||
[[autodoc]] ImageToImagePipeline
|
||||
- __call__
|
||||
- all
|
||||
|
||||
### ObjectDetectionPipeline
|
||||
|
||||
[[autodoc]] ObjectDetectionPipeline
|
||||
@ -475,6 +481,12 @@ Pipelines available for multimodal tasks include the following.
|
||||
- __call__
|
||||
- all
|
||||
|
||||
### MaskGenerationPipeline
|
||||
|
||||
[[autodoc]] MaskGenerationPipeline
|
||||
- __call__
|
||||
- all
|
||||
|
||||
### VisualQuestionAnsweringPipeline
|
||||
|
||||
[[autodoc]] VisualQuestionAnsweringPipeline
|
||||
|
||||
@ -16,11 +16,102 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Quantize 🤗 Transformers models
|
||||
|
||||
## AWQ integration
|
||||
|
||||
AWQ method has been introduced in the [*AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration* paper](https://arxiv.org/abs/2306.00978). With AWQ you can run models in 4-bit precision, while preserving its original quality (i.e. no performance degradation) with a superior throughput that other quantization methods presented below - reaching similar throughput as pure `float16` inference.
|
||||
|
||||
We now support inference with any AWQ model, meaning anyone can load and use AWQ weights that are pushed on the Hub or saved locally. Note that using AWQ requires to have access to a NVIDIA GPU. CPU inference is not supported yet.
|
||||
|
||||
### Quantizing a model
|
||||
|
||||
We advise users to look at different existing tools in the ecosystem to quantize their models with AWQ algorithm, such as:
|
||||
|
||||
- [`llm-awq`](https://github.com/mit-han-lab/llm-awq) from MIT Han Lab
|
||||
- [`autoawq`](https://github.com/casper-hansen/AutoAWQ) from [`casper-hansen`](https://github.com/casper-hansen)
|
||||
- Intel neural compressor from Intel - through [`optimum-intel`](https://huggingface.co/docs/optimum/main/en/intel/optimization_inc)
|
||||
|
||||
Many other tools might exist in the ecosystem, please feel free to open a PR to add them to the list.
|
||||
Currently the integration with 🤗 Transformers is only available for models that have been quantized using `autoawq` library and `llm-awq`. Most of the models quantized with `auto-awq` can be found under [`TheBloke`](https://huggingface.co/TheBloke) namespace of 🤗 Hub, and to quantize models with `llm-awq` please refer to the [`convert_to_hf.py`](https://github.com/mit-han-lab/llm-awq/blob/main/examples/convert_to_hf.py) script in the examples folder of [`llm-awq`](https://github.com/mit-han-lab/llm-awq/).
|
||||
|
||||
### Load a quantized model
|
||||
|
||||
You can load a quantized model from the Hub using the `from_pretrained` method. Make sure that the pushed weights are quantized, by checking that the attribute `quantization_config` is present in the model's configuration file (`configuration.json`). You can confirm that the model is quantized in the AWQ format by checking the field `quantization_config.quant_method` which should be set to `"awq"`. Note that loading the model will set other weights in `float16` by default for performance reasons. If you want to change that behavior, you can pass `torch_dtype` argument to `torch.float32` or `torch.bfloat16`. You can find in the sections below some example snippets and notebook.
|
||||
|
||||
## Example usage
|
||||
|
||||
First, you need to install [`autoawq`](https://github.com/casper-hansen/AutoAWQ) library
|
||||
|
||||
```bash
|
||||
pip install autoawq
|
||||
```
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model_id = "TheBloke/zephyr-7B-alpha-AWQ"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0")
|
||||
```
|
||||
|
||||
In case you first load your model on CPU, make sure to move it to your GPU device before using
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model_id = "TheBloke/zephyr-7B-alpha-AWQ"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id).to("cuda:0")
|
||||
```
|
||||
|
||||
### Combining AWQ and Flash Attention
|
||||
|
||||
You can combine AWQ quantization with Flash Attention to get a model that is both quantized and faster. Simply load the model using `from_pretrained` and pass `use_flash_attention_2=True` argument.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", use_flash_attention_2=True, device_map="cuda:0")
|
||||
```
|
||||
|
||||
### Benchmarks
|
||||
|
||||
We performed some speed, throughput and latency benchmarks using [`optimum-benchmark`](https://github.com/huggingface/optimum-benchmark) library.
|
||||
|
||||
Note at that time of writing this documentation section, the available quantization methods were: `awq`, `gptq` and `bitsandbytes`.
|
||||
|
||||
The benchmark was run on a NVIDIA-A100 instance and the model used was [`TheBloke/Mistral-7B-v0.1-AWQ`](https://huggingface.co/TheBloke/Mistral-7B-v0.1-AWQ) for the AWQ model, [`TheBloke/Mistral-7B-v0.1-GPTQ`](https://huggingface.co/TheBloke/Mistral-7B-v0.1-GPTQ) for the GPTQ model. We also benchmarked it against `bitsandbytes` quantization methods and native `float16` model. Some results are shown below:
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_memory_plot.png">
|
||||
</div>
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_memory_plot.png">
|
||||
</div>
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_throughput_plot.png">
|
||||
</div>
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_latency_plot.png">
|
||||
</div>
|
||||
|
||||
You can find the full results together with packages versions in [this link](https://github.com/huggingface/optimum-benchmark/tree/main/examples/running-mistral).
|
||||
|
||||
From the results it appears that AWQ quantization method is the fastest quantization method for inference, text generation and among the lowest peak memory for text generation. However, AWQ seems to have the largest forward latency per batch size.
|
||||
|
||||
### Google colab demo
|
||||
|
||||
Check out how to use this integration throughout this [Google Colab demo](https://colab.research.google.com/drive/1HzZH89yAXJaZgwJDhQj9LqSBux932BvY)!
|
||||
|
||||
### AwqConfig
|
||||
|
||||
[[autodoc]] AwqConfig
|
||||
|
||||
## `AutoGPTQ` Integration
|
||||
|
||||
🤗 Transformers has integrated `optimum` API to perform GPTQ quantization on language models. You can load and quantize your model in 8, 4, 3 or even 2 bits without a big drop of performance and faster inference speed! This is supported by most GPU hardwares.
|
||||
|
||||
To learn more about the the quantization model, check out:
|
||||
To learn more about the quantization model, check out:
|
||||
- the [GPTQ](https://arxiv.org/pdf/2210.17323.pdf) paper
|
||||
- the `optimum` [guide](https://huggingface.co/docs/optimum/llm_quantization/usage_guides/quantization) on GPTQ quantization
|
||||
- the [`AutoGPTQ`](https://github.com/PanQiWei/AutoGPTQ) library used as the backend
|
||||
@ -48,6 +139,7 @@ Note that GPTQ integration supports for now only text models and you may encount
|
||||
GPTQ is a quantization method that requires weights calibration before using the quantized models. If you want to quantize transformers model from scratch, it might take some time before producing the quantized model (~5 min on a Google colab for `facebook/opt-350m` model).
|
||||
|
||||
Hence, there are two different scenarios where you want to use GPTQ-quantized models. The first use case would be to load models that has been already quantized by other users that are available on the Hub, the second use case would be to quantize your model from scratch and save it or push it on the Hub so that other users can also use it.
|
||||
|
||||
#### GPTQ Configuration
|
||||
|
||||
In order to load and quantize a model, you need to create a [`GPTQConfig`]. You need to pass the number of `bits`, a `dataset` in order to calibrate the quantization and the `tokenizer` of the model in order prepare the dataset.
|
||||
@ -59,6 +151,7 @@ gptq_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer)
|
||||
```
|
||||
|
||||
Note that you can pass your own dataset as a list of string. However, it is highly recommended to use the dataset from the GPTQ paper.
|
||||
|
||||
```python
|
||||
dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
|
||||
quantization = GPTQConfig(bits=4, dataset = dataset, tokenizer=tokenizer)
|
||||
@ -71,14 +164,17 @@ You can quantize a model by using `from_pretrained` and setting the `quantizatio
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=gptq_config)
|
||||
|
||||
```
|
||||
Note that you will need a GPU to quantize a model. We will put the model in the cpu and move the modules back and forth to the gpu in order to quantize them.
|
||||
|
||||
If you want to maximize your gpus usage while using cpu offload, you can set `device_map = "auto"`.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
|
||||
```
|
||||
|
||||
Note that disk offload is not supported. Furthermore, if you are out of memory because of the dataset, you may have to pass `max_memory` in `from_pretained`. Checkout this [guide](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#designing-a-device-map) to learn more about `device_map` and `max_memory`.
|
||||
|
||||
<Tip warning={true}>
|
||||
@ -95,12 +191,14 @@ tokenizer.push_to_hub("opt-125m-gptq")
|
||||
```
|
||||
|
||||
If you want to save your quantized model on your local machine, you can also do it with `save_pretrained`:
|
||||
|
||||
```python
|
||||
quantized_model.save_pretrained("opt-125m-gptq")
|
||||
tokenizer.save_pretrained("opt-125m-gptq")
|
||||
```
|
||||
|
||||
Note that if you have quantized your model with a `device_map`, make sure to move the entire model to one of your gpus or the `cpu` before saving it.
|
||||
|
||||
```python
|
||||
quantized_model.to("cpu")
|
||||
quantized_model.save_pretrained("opt-125m-gptq")
|
||||
@ -117,6 +215,7 @@ model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq")
|
||||
```
|
||||
|
||||
If you want to load a model faster and without allocating more memory than needed, the `device_map` argument also works with quantized model. Make sure that you have `accelerate` library installed.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto")
|
||||
@ -124,16 +223,25 @@ model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", de
|
||||
|
||||
### Exllama kernels for faster inference
|
||||
|
||||
For 4-bit model, you can use the exllama kernels in order to a faster inference speed. It is activated by default. You can change that behavior by passing `disable_exllama` in [`GPTQConfig`]. This will overwrite the quantization config stored in the config. Note that you will only be able to overwrite the attributes related to the kernels. Furthermore, you need to have the entire model on gpus if you want to use exllama kernels.
|
||||
For 4-bit model, you can use the exllama kernels in order to a faster inference speed. It is activated by default. You can change that behavior by passing `use_exllama` in [`GPTQConfig`]. This will overwrite the quantization config stored in the config. Note that you will only be able to overwrite the attributes related to the kernels. Furthermore, you need to have the entire model on gpus if you want to use exllama kernels. Also, you can perform CPU inference using Auto-GPTQ for Auto-GPTQ version > 0.4.2 by passing `device_map` = "cpu". For CPU inference, you have to pass `use_exllama = False` in the `GPTQConfig.`
|
||||
|
||||
```py
|
||||
import torch
|
||||
gptq_config = GPTQConfig(bits=4, disable_exllama=False)
|
||||
gptq_config = GPTQConfig(bits=4)
|
||||
model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=gptq_config)
|
||||
```
|
||||
|
||||
With the release of the exllamav2 kernels, you can get faster inference speed compared to the exllama kernels. You just need to pass `exllama_config={"version": 2}` in [`GPTQConfig`]:
|
||||
|
||||
```py
|
||||
import torch
|
||||
gptq_config = GPTQConfig(bits=4, exllama_config={"version":2})
|
||||
model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config = gptq_config)
|
||||
```
|
||||
|
||||
Note that only 4-bit models are supported for now. Furthermore, it is recommended to deactivate the exllama kernels if you are finetuning a quantized model with peft.
|
||||
|
||||
You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)
|
||||
#### Fine-tune a quantized model
|
||||
|
||||
With the official support of adapters in the Hugging Face ecosystem, you can fine-tune models that have been quantized with GPTQ.
|
||||
@ -336,6 +444,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("{your_username}/bloom-560m-8bit", device_map="auto")
|
||||
```
|
||||
|
||||
Note that in this case, you don't need to specify the arguments `load_in_8bit=True`, but you need to make sure that `bitsandbytes` and `accelerate` are installed.
|
||||
Note also that `device_map` is optional but setting `device_map = 'auto'` is prefered for inference as it will dispatch efficiently the model on the available ressources.
|
||||
|
||||
@ -356,6 +465,7 @@ quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
|
||||
```
|
||||
|
||||
Let's say you want to load `bigscience/bloom-1b7` model, and you have just enough GPU RAM to fit the entire model except the `lm_head`. Therefore write a custom device_map as follows:
|
||||
|
||||
```python
|
||||
device_map = {
|
||||
"transformer.word_embeddings": 0,
|
||||
|
||||
@ -18,6 +18,12 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
The [`Trainer`] class provides an API for feature-complete training in PyTorch for most standard use cases. It's used in most of the [example scripts](https://github.com/huggingface/transformers/tree/main/examples).
|
||||
|
||||
<Tip>
|
||||
|
||||
If you're looking to fine-tune a language model like Llama-2 or Mistral on a text dataset using autoregressive techniques, consider using [`trl`](https://github.com/huggingface/trl)'s [`~trl.SFTTrainer`]. The [`~trl.SFTTrainer`] wraps the [`Trainer`] and is specially optimized for this particular task and supports sequence packing, LoRA, quantization, and DeepSpeed for efficient scaling to any model size. On the other hand, the [`Trainer`] is a more versatile option, suitable for a broader spectrum of tasks.
|
||||
|
||||
</Tip>
|
||||
|
||||
Before instantiating your [`Trainer`], create a [`TrainingArguments`] to access all the points of customization during training.
|
||||
|
||||
The API supports distributed training on multiple GPUs/TPUs, mixed precision through [NVIDIA Apex](https://github.com/NVIDIA/apex) and Native AMP for PyTorch.
|
||||
@ -204,6 +210,7 @@ python -m torch.distributed.launch --nproc_per_node=2 trainer-program.py ...
|
||||
```
|
||||
|
||||
if you have either [`accelerate`](https://github.com/huggingface/accelerate) or [`deepspeed`](https://github.com/microsoft/DeepSpeed) installed you can also accomplish the same by using one of:
|
||||
|
||||
```bash
|
||||
accelerate launch --num_processes 2 trainer-program.py ...
|
||||
```
|
||||
@ -240,6 +247,7 @@ CUDA_VISIBLE_DEVICES=2,0 python -m torch.distributed.launch trainer-program.py .
|
||||
Here your physical GPUs 0 and 2 are mapped to `cuda:1` and `cuda:0` correspondingly.
|
||||
|
||||
The above examples were all for `DistributedDataParallel` use pattern, but the same method works for [`DataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) as well:
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=2,0 python trainer-program.py ...
|
||||
```
|
||||
@ -732,3 +740,27 @@ Sections that were moved:
|
||||
| <a href="./deepspeed#deepspeed-grad-clip">Gradient Clipping</a><a id="gradient-clipping"></a>
|
||||
| <a href="./deepspeed#deepspeed-weight-extraction">Getting The Model Weights Out</a><a id="getting-the-model-weights-out"></a>
|
||||
]
|
||||
|
||||
## Boost your fine-tuning performances using NEFTune
|
||||
|
||||
|
||||
NEFTune is a technique to boost the performance of chat models and was introduced by the paper “NEFTune: Noisy Embeddings Improve Instruction Finetuning” from Jain et al. It consists of adding noise to the embedding vectors during training. According to the abstract of the paper:
|
||||
|
||||
> Standard finetuning of LLaMA-2-7B using Alpaca achieves 29.79% on AlpacaEval, which rises to 64.69% using noisy embeddings. NEFTune also improves over strong baselines on modern instruction datasets. Models trained with Evol-Instruct see a 10% improvement, with ShareGPT an 8% improvement, and with OpenPlatypus an 8% improvement. Even powerful models further refined with RLHF such as LLaMA-2-Chat benefit from additional training with NEFTune.
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/neft-screenshot.png">
|
||||
</div>
|
||||
|
||||
To use it in `Trainer` simply pass `neftune_noise_alpha` when creating your `TrainingArguments` instance. Note that to avoid any surprising behaviour, NEFTune is disabled after training to retrieve back the original behaviour of the embedding layer.
|
||||
|
||||
```python
|
||||
from transformers import Trainer, TrainingArguments
|
||||
|
||||
args = TrainingArguments(..., neftune_noise_alpha=0.1)
|
||||
trainer = Trainer(..., args=args)
|
||||
|
||||
...
|
||||
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
@ -266,6 +266,10 @@ The following auto classes are available for the following computer vision tasks
|
||||
|
||||
[[autodoc]] AutoModelForImageSegmentation
|
||||
|
||||
### AutoModelForImageToImage
|
||||
|
||||
[[autodoc]] AutoModelForImageToImage
|
||||
|
||||
### AutoModelForSemanticSegmentation
|
||||
|
||||
[[autodoc]] AutoModelForSemanticSegmentation
|
||||
|
||||
@ -64,7 +64,7 @@ model.enable_cpu_offload()
|
||||
|
||||
Note that 🤗 Accelerate must be installed before using this feature. [Here's how to install it.](https://huggingface.co/docs/accelerate/basic_tutorials/install)
|
||||
|
||||
#### Combining optimizaton techniques
|
||||
#### Combining optimization techniques
|
||||
|
||||
You can combine optimization techniques, and use CPU offload, half-precision and 🤗 Better Transformer all at once.
|
||||
|
||||
|
||||
@ -83,8 +83,23 @@ This model was contributed by [valhalla](https://huggingface.co/valhalla). The o
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CLIP.
|
||||
|
||||
- A blog post on [How to fine-tune CLIP on 10,000 image-text pairs](https://huggingface.co/blog/fine-tune-clip-rsicd).
|
||||
- CLIP is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/contrastive-image-text).
|
||||
- [Fine tuning CLIP with Remote Sensing (Satellite) images and captions](https://huggingface.co/blog/fine-tune-clip-rsicd), a blog post about how to fine-tune CLIP with [RSICD dataset](https://github.com/201528014227051/RSICD_optimal) and comparison of performance changes due to data augmentation.
|
||||
- This [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/contrastive-image-text) shows how to train a CLIP-like vision-text dual encoder model using a pre-trained vision and text encoder using [COCO dataset](https://cocodataset.org/#home).
|
||||
|
||||
<PipelineTag pipeline="image-to-text"/>
|
||||
|
||||
- A [notebook](https://colab.research.google.com/drive/1tuoAC5F4sC7qid56Z0ap-stR3rwdk0ZV?usp=sharing) on how to use a pretrained CLIP for inference with beam search for image captioning. 🌎
|
||||
|
||||
**Image retrieval**
|
||||
|
||||
- A [notebook](https://colab.research.google.com/drive/1bLVwVKpAndpEDHqjzxVPr_9nGrSbuOQd?usp=sharing) on image retrieval using pretrained CLIP and computing MRR (Mean Reciprocal Rank) score. 🌎
- A [notebook](https://colab.research.google.com/github/deep-diver/image_search_with_natural_language/blob/main/notebooks/Image_Search_CLIP.ipynb) on image retrieval and showing the similarity score. 🌎
- A [notebook](https://colab.research.google.com/drive/1xO-wC_m_GNzgjIBQ4a4znvQkvDoZJvH4?usp=sharing) on how to map images and texts to the same vector space using Multilingual CLIP. 🌎
- A [notebook](https://colab.research.google.com/github/vivien000/clip-demo/blob/master/clip.ipynb#scrollTo=uzdFhRGqiWkR) on how to run CLIP on semantic image search using [Unsplash](https://unsplash.com) and [TMBD](https://www.themoviedb.org/) datasets. 🌎
|
||||
|
||||
**Explainability**
|
||||
|
||||
- A [notebook](https://colab.research.google.com/github/hila-chefer/Transformer-MM-Explainability/blob/main/CLIP_explainability.ipynb) on how to visualize similarity between input token and image segment. 🌎
|
||||
|
||||
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it.
|
||||
The resource should ideally demonstrate something new instead of duplicating an existing resource.
|
||||
|
||||
@ -59,3 +59,14 @@ If you're interested in submitting a resource to be included here, please feel f
|
||||
|
||||
[[autodoc]] ConvNextV2ForImageClassification
|
||||
- forward
|
||||
|
||||
## TFConvNextV2Model
|
||||
|
||||
[[autodoc]] TFConvNextV2Model
|
||||
- call
|
||||
|
||||
|
||||
## TFConvNextV2ForImageClassification
|
||||
|
||||
[[autodoc]] TFConvNextV2ForImageClassification
|
||||
- call
|
||||
|
||||
@ -19,10 +19,10 @@ rendered properly in your Markdown viewer.
|
||||
## Overview
|
||||
|
||||
Flan-UL2 is an encoder decoder model based on the T5 architecture. It uses the same configuration as the [UL2](ul2) model released earlier last year.
|
||||
It was fine tuned using the "Flan" prompt tuning and dataset collection. Similiar to `Flan-T5`, one can directly use FLAN-UL2 weights without finetuning the model:
|
||||
It was fine tuned using the "Flan" prompt tuning and dataset collection. Similar to `Flan-T5`, one can directly use FLAN-UL2 weights without finetuning the model:
|
||||
|
||||
|
||||
According ot the original blog here are the notable improvements:
|
||||
According to the original blog here are the notable improvements:
|
||||
|
||||
- The original UL2 model was only trained with a receptive field of 512, which made it non-ideal for N-shot prompting where N is large.
|
||||
- The Flan-UL2 checkpoint uses a receptive field of 2048 which makes it more usable for few-shot in-context learning.
|
||||
@ -53,4 +53,4 @@ The model is pretty heavy (~40GB in half precision) so if you just want to run t
|
||||
|
||||
## Inference
|
||||
|
||||
The inference protocol is exaclty the same as any `T5` model, please have a look at the [T5's documentation page](t5) for more details.
|
||||
The inference protocol is exactly the same as for any `T5` model; please have a look at [T5's documentation page](t5) for more details.
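For example, a minimal T5-style inference sketch (assuming enough memory, or 🤗 Accelerate installed for `device_map="auto"`):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/flan-ul2")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-ul2", torch_dtype="auto", device_map="auto")

inputs = tokenizer("Translate English to German: How old are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```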
|
||||
|
||||
115
docs/source/en/model_doc/fuyu.md
Normal file
@ -0,0 +1,115 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Fuyu
|
||||
|
||||
## Overview
|
||||
|
||||
The Fuyu model was created by [ADEPT](https://www.adept.ai/blog/fuyu-8b), and authored by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
|
||||
|
||||
The authors introduced Fuyu-8B, a decoder-only multimodal model based on the classic transformers architecture, with query and key normalization. A linear encoder is added to create multimodal embeddings from image inputs.
|
||||
|
||||
By treating image tokens like text tokens and using a special image-newline character, the model knows when an image line ends. Image positional embeddings are removed. This avoids the need for different training phases for various image resolutions. With 8 billion parameters and licensed under CC-BY-NC, Fuyu-8B is notable for its ability to handle both text and images, its impressive context size of 16K, and its overall performance.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
The `Fuyu` models were trained using `bfloat16`, but the original inference uses `float16`. The checkpoints uploaded on the Hub use `torch_dtype = 'float16'`, which will be
|
||||
used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`.
|
||||
|
||||
The `dtype` of the online weights is mostly irrelevant, unless you are using `torch_dtype="auto"` when initializing a model with `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype="auto")`. The reason is that the model will first be downloaded (using the `dtype` of the checkpoints online) and then cast to the default `dtype` of `torch` (`torch.float32`). Users should specify the `torch_dtype` they want; if they don't, it will be `torch.float32`.
|
||||
|
||||
Finetuning the model in `float16` is not recommended and is known to produce `nan`; as such, the model should be fine-tuned in `bfloat16`.
|
||||
|
||||
</Tip>
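For example, a minimal sketch of loading the checkpoint in `bfloat16` (the `adept-hf-collab/fuyu-8b` repo id is the one used in the processor example further below):

```python
import torch
from transformers import FuyuForCausalLM

# pass an explicit `torch_dtype` so the weights are not upcast to the default `torch.float32`
model = FuyuForCausalLM.from_pretrained("adept-hf-collab/fuyu-8b", torch_dtype=torch.bfloat16)
```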
|
||||
|
||||
|
||||
Tips:
|
||||
|
||||
- To convert the model, you need to clone the original repository using `git clone https://github.com/persimmon-ai-labs/adept-inference`, then get the checkpoints:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/persimmon-ai-labs/adept-inference
|
||||
wget path/to/fuyu-8b-model-weights.tar
|
||||
tar -xvf fuyu-8b-model-weights.tar
|
||||
python src/transformers/models/fuyu/convert_fuyu_weights_to_hf.py --input_dir /path/to/downloaded/fuyu/weights/ --output_dir /output/path \
|
||||
--pt_model_path /path/to/fuyu_8b_release/iter_0001251/mp_rank_00/model_optim_rng.pt \
|
||||
--ada_lib_path /path/to/adept-inference
|
||||
```
|
||||
|
||||
For the chat model:
|
||||
```bash
|
||||
wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar
|
||||
tar -xvf 8b_chat_model_release.tar
|
||||
```
|
||||
Then, the model can be loaded via:
|
||||
|
||||
```py
|
||||
from transformers import FuyuConfig, FuyuForCausalLM
|
||||
model_config = FuyuConfig()
|
||||
model = FuyuForCausalLM.from_pretrained('/output/path', config=model_config)
|
||||
```
|
||||
|
||||
Inputs need to be passed through a specific Processor to have the correct formats.
|
||||
A processor requires an image_processor and a tokenizer. Hence, inputs can be loaded via:
|
||||
|
||||
```py
|
||||
import io

import requests
from PIL import Image
|
||||
from transformers import AutoTokenizer
|
||||
from transformers.models.fuyu.processing_fuyu import FuyuProcessor
|
||||
from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
|
||||
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained('adept-hf-collab/fuyu-8b')
|
||||
image_processor = FuyuImageProcessor()
|
||||
|
||||
|
||||
processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
||||
text_prompt = "Generate a coco-style caption.\\n"
|
||||
|
||||
bus_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
|
||||
bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content))
|
||||
inputs_to_model = processor(text=text_prompt, images=bus_image_pil)
|
||||
|
||||
|
||||
```
|
||||
|
||||
This model was contributed by [Molbap](https://huggingface.co/Molbap).
|
||||
The original code can be found [here](https://github.com/persimmon-ai-labs/adept-inference).
|
||||
|
||||
- Fuyu uses a `sentencepiece` based tokenizer, with a `Unigram` model. It supports bytefallback, which is only available in `tokenizers==0.14.0` for the fast tokenizer.
|
||||
The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
|
||||
|
||||
- The authors suggest using the following prompt for image captioning: `f"Generate a coco-style caption.\\n"`
|
||||
|
||||
|
||||
## FuyuConfig
|
||||
|
||||
[[autodoc]] FuyuConfig
|
||||
|
||||
## FuyuForCausalLM
|
||||
|
||||
[[autodoc]] FuyuForCausalLM
|
||||
- forward
|
||||
|
||||
## FuyuImageProcessor
|
||||
|
||||
[[autodoc]] FuyuImageProcessor
|
||||
- __call__
|
||||
|
||||
## FuyuProcessor
|
||||
|
||||
[[autodoc]] FuyuProcessor
|
||||
- __call__
|
||||
@ -42,6 +42,45 @@ The main differences compared to GPT2.
|
||||
|
||||
You can read more about the optimizations in the [original pull request](https://github.com/huggingface/transformers/pull/22575)
|
||||
|
||||
## Combining Starcoder and Flash Attention 2
|
||||
|
||||
First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
|
||||
|
||||
```bash
|
||||
pip install -U flash-attn --no-build-isolation
|
||||
```
|
||||
|
||||
Also make sure that your hardware is compatible with Flash Attention 2. Read more about it in the official documentation of the [`flash-attn`](https://github.com/Dao-AILab/flash-attention) repository. Also make sure to load your model in half-precision (e.g. `torch.float16`).
|
||||
|
||||
To load and run a model using Flash Attention 2, refer to the snippet below:
|
||||
|
||||
```python
|
||||
>>> import torch
|
||||
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
>>> device = "cuda" # the device to load the model onto
|
||||
|
||||
>>> model = AutoModelForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder", torch_dtype=torch.float16, use_flash_attention_2=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("bigcode/gpt_bigcode-santacoder")
|
||||
|
||||
>>> prompt = "def hello_world():"
|
||||
|
||||
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
|
||||
>>> model.to(device)
|
||||
|
||||
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
|
||||
>>> tokenizer.batch_decode(generated_ids)[0]
|
||||
'def hello_world():\n print("hello world")\n\nif __name__ == "__main__":\n print("hello world")\n<|endoftext|>'
|
||||
```
|
||||
|
||||
### Expected speedups
|
||||
|
||||
Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using the `bigcode/starcoder` checkpoint and the Flash Attention 2 version of the model, for two different sequence lengths.
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/starcoder-speedup.png">
|
||||
</div>
|
||||
|
||||
|
||||
## GPTBigCodeConfig
|
||||
|
||||
[[autodoc]] GPTBigCodeConfig
|
||||
|
||||
@ -28,7 +28,7 @@ The abstract from the paper is the following:
|
||||
|
||||
As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length.
|
||||
First, an autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. Each prior is linked to the previous one via an `AudioConditioner` module. The `AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frames-per-second resolution.
|
||||
The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positionnal embedding for the timing data. The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio.
|
||||
The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positional embedding for the timing data. The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio.
|
||||
|
||||

|
||||
|
||||
@ -36,7 +36,7 @@ Tips:
|
||||
- This model only supports inference. This is for a few reasons, mostly because it requires a crazy amount of memory to train. Feel free to open a PR and add what's missing to have a full integration with the Hugging Face trainer!
|
||||
- This model is very slow, and takes 8 hours to generate a minute-long audio sample using the 5b top prior on a V100 GPU. In order to automatically handle the device on which the model should execute, use `accelerate`.
|
||||
- Contrary to the paper, the order of the priors goes from `0` to `1` as it felt more intuitive: we sample starting from `0`.
|
||||
- Primed sampling (conditionning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`.
|
||||
- Primed sampling (conditioning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`.
|
||||
|
||||
This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
|
||||
The original code can be found [here](https://github.com/openai/jukebox).
|
||||
|
||||
98
docs/source/en/model_doc/kosmos-2.md
Normal file
@ -0,0 +1,98 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# KOSMOS-2
|
||||
|
||||
## Overview
|
||||
|
||||
The KOSMOS-2 model was proposed in [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
|
||||
|
||||
KOSMOS-2 is a Transformer-based causal language model and is trained using the next-word prediction task on a web-scale
|
||||
dataset of grounded image-text pairs [GRIT](https://huggingface.co/datasets/zzliang/GRIT). The spatial coordinates of
|
||||
the bounding boxes in the dataset are converted to a sequence of location tokens, which are appended to their respective
|
||||
entity text spans (for example, `a snowman` followed by `<patch_index_0044><patch_index_0863>`). The data format is
|
||||
similar to тАЬhyperlinksтАЭ that connect the object regions in an image to their text span in the corresponding caption.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We introduce Kosmos-2, a Multimodal Large Language Model (MLLM), enabling new capabilities of perceiving object descriptions (e.g., bounding boxes) and grounding text to the visual world. Specifically, we represent refer expressions as links in Markdown, i.e., ``[text span](bounding boxes)'', where object descriptions are sequences of location tokens. Together with multimodal corpora, we construct large-scale data of grounded image-text pairs (called GrIT) to train the model. In addition to the existing capabilities of MLLMs (e.g., perceiving general modalities, following instructions, and performing in-context learning), Kosmos-2 integrates the grounding capability into downstream applications. We evaluate Kosmos-2 on a wide range of tasks, including (i) multimodal grounding, such as referring expression comprehension, and phrase grounding, (ii) multimodal referring, such as referring expression generation, (iii) perception-language tasks, and (iv) language understanding and generation. This work lays out the foundation for the development of Embodiment AI and sheds light on the big convergence of language, multimodal perception, action, and world modeling, which is a key step toward artificial general intelligence. Code and pretrained models are available at https://aka.ms/kosmos-2.*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/kosmos_2_overview.jpg"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> Overview of tasks that KOSMOS-2 can handle. Taken from the <a href="https://arxiv.org/abs/2306.14824">original paper</a>. </small>
|
||||
|
||||
## Example
|
||||
|
||||
```python
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
>>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
|
||||
|
||||
>>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
|
||||
>>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
|
||||
|
||||
>>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> prompt = "<grounding> An image of"
|
||||
|
||||
>>> inputs = processor(text=prompt, images=image, return_tensors="pt")
|
||||
|
||||
>>> generated_ids = model.generate(
|
||||
... pixel_values=inputs["pixel_values"],
|
||||
... input_ids=inputs["input_ids"],
|
||||
... attention_mask=inputs["attention_mask"],
|
||||
... image_embeds=None,
|
||||
... image_embeds_position_mask=inputs["image_embeds_position_mask"],
|
||||
... use_cache=True,
|
||||
... max_new_tokens=64,
|
||||
... )
|
||||
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
||||
>>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
|
||||
>>> processed_text
|
||||
'<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'
|
||||
|
||||
>>> caption, entities = processor.post_process_generation(generated_text)
|
||||
>>> caption
|
||||
'An image of a snowman warming himself by a fire.'
|
||||
|
||||
>>> entities
|
||||
[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
|
||||
```
|
||||
|
||||
This model was contributed by [Yih-Dar SHIEH](https://huggingface.co/ydshieh). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/kosmos-2).
|
||||
|
||||
## Kosmos2Config
|
||||
|
||||
[[autodoc]] Kosmos2Config
|
||||
|
||||
## Kosmos2ImageProcessor
|
||||
|
||||
## Kosmos2Processor
|
||||
|
||||
[[autodoc]] Kosmos2Processor
|
||||
- __call__
|
||||
|
||||
## Kosmos2Model
|
||||
|
||||
[[autodoc]] Kosmos2Model
|
||||
- forward
|
||||
|
||||
## Kosmos2ForConditionalGeneration
|
||||
|
||||
[[autodoc]] Kosmos2ForConditionalGeneration
|
||||
- forward
|
||||
@ -28,12 +28,12 @@ Checkout all Llama2 models [here](https://huggingface.co/models?search=llama2)
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
The `Llama2` models were trained using `bfloat16`, but the original inference uses `float16. The checkpoints uploaded on the hub use `torch_dtype = 'float16'` which will be
|
||||
The `Llama2` models were trained using `bfloat16`, but the original inference uses `float16`. The checkpoints uploaded on the Hub use `torch_dtype = 'float16'`, which will be
|
||||
used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`.
|
||||
|
||||
The `dtype` of the online weights is mostly irrelevant, unless you are using `torch_dtype="auto"` when initializing a model using `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`. The reason is that the model will first be downloaded ( using the `dtype` of the checkpoints online) then it will be casted to the default `dtype` of `torch` (becomes `torch.float32`) and finally, if there is a `torch_dtype` provided in the config, it will be used.
|
||||
The `dtype` of the online weights is mostly irrelevant unless you are using `torch_dtype="auto"` when initializing a model using `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`. The reason is that the model will first be downloaded ( using the `dtype` of the checkpoints online), then it will be casted to the default `dtype` of `torch` (becomes `torch.float32`), and finally, if there is a `torch_dtype` provided in the config, it will be used.
|
||||
|
||||
Training the model in `float16` is not recommended and known to produce `nan`, as such the model should be trained in `bfloat16`.
|
||||
Training the model in `float16` is not recommended and is known to produce `nan`; as such, the model should be trained in `bfloat16`.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
151
docs/source/en/model_doc/mistral.md
Normal file
@ -0,0 +1,151 @@
|
||||
<!--Copyright 2023 Mistral AI and The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Mistral
|
||||
|
||||
## Overview
|
||||
|
||||
Mistral-7B-v0.1 is Mistral AI's first Large Language Model (LLM).
|
||||
|
||||
## Model Details
|
||||
|
||||
Mistral-7B-v0.1 is a decoder-based LM with the following architectural choices:
|
||||
* Sliding Window Attention - Trained with 8k context length and fixed cache size, with a theoretical attention span of 128K tokens
|
||||
* GQA (Grouped Query Attention) - allowing faster inference and lower cache size.
|
||||
* Byte-fallback BPE tokenizer - ensures that characters are never mapped to out of vocabulary tokens.
|
||||
|
||||
We also provide an instruction fine-tuned model: `Mistral-7B-Instruct-v0.1` which can be used for chat-based inference.
|
||||
|
||||
For more details please read our [release blog post](https://mistral.ai/news/announcing-mistral-7b/)
|
||||
|
||||
## License
|
||||
|
||||
Both `Mistral-7B-v0.1` and `Mistral-7B-Instruct-v0.1` are released under the Apache 2.0 license.
|
||||
|
||||
## Usage
|
||||
|
||||
`Mistral-7B-v0.1` and `Mistral-7B-Instruct-v0.1` can be found on the [Huggingface Hub](https://huggingface.co/mistralai)
|
||||
|
||||
These ready-to-use checkpoints can be downloaded and used via the HuggingFace Hub:
|
||||
|
||||
```python
|
||||
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
>>> device = "cuda" # the device to load the model onto
|
||||
|
||||
>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
|
||||
>>> prompt = "My favourite condiment is"
|
||||
|
||||
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
|
||||
>>> model.to(device)
|
||||
|
||||
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
|
||||
>>> tokenizer.batch_decode(generated_ids)[0]
|
||||
"The expected output"
|
||||
```
|
||||
|
||||
Raw weights for `Mistral-7B-v0.1` and `Mistral-7B-Instruct-v0.1` can be downloaded from:
|
||||
|
||||
| Model Name | Checkpoint |
|
||||
|----------------------------|-----------------------------------------------------------------------------------------|
|
||||
| `Mistral-7B-v0.1` | [Raw Checkpoint](https://files.mistral-7b-v0-1.mistral.ai/mistral-7B-v0.1.tar) |
|
||||
| `Mistral-7B-Instruct-v0.1` | [Raw Checkpoint](https://files.mistral-7b-v0-1.mistral.ai/mistral-7B-instruct-v0.1.tar) |
|
||||
|
||||
|
||||
To use these raw checkpoints with HuggingFace you can use the `convert_mistral_weights_to_hf.py` script to convert them to the HuggingFace format:
|
||||
|
||||
```bash
|
||||
python src/transformers/models/mistral/convert_mistral_weights_to_hf.py \
|
||||
--input_dir /path/to/downloaded/mistral/weights --model_size 7B --output_dir /output/path
|
||||
```
|
||||
|
||||
You can then load the converted model from the `output/path`:
|
||||
|
||||
```python
|
||||
from transformers import MistralForCausalLM, LlamaTokenizer
|
||||
|
||||
tokenizer = LlamaTokenizer.from_pretrained("/output/path")
|
||||
model = MistralForCausalLM.from_pretrained("/output/path")
|
||||
```
|
||||
|
||||
## Combining Mistral and Flash Attention 2
|
||||
|
||||
First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
|
||||
|
||||
```bash
|
||||
pip install -U flash-attn --no-build-isolation
|
||||
```
|
||||
|
||||
Also make sure that your hardware is compatible with Flash Attention 2. Read more about it in the official documentation of the [`flash-attn`](https://github.com/Dao-AILab/flash-attention) repository. Also make sure to load your model in half-precision (e.g. `torch.float16`).
|
||||
|
||||
To load and run a model using Flash Attention 2, refer to the snippet below:
|
||||
|
||||
```python
|
||||
>>> import torch
|
||||
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
>>> device = "cuda" # the device to load the model onto
|
||||
|
||||
>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, use_flash_attention_2=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
|
||||
>>> prompt = "My favourite condiment is"
|
||||
|
||||
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
|
||||
>>> model.to(device)
|
||||
|
||||
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
|
||||
>>> tokenizer.batch_decode(generated_ids)[0]
|
||||
"The expected output"
|
||||
```
|
||||
|
||||
### Expected speedups
|
||||
|
||||
Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using the `mistralai/Mistral-7B-v0.1` checkpoint and the Flash Attention 2 version of the model.
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/mistral-7b-inference-large-seqlen.png">
|
||||
</div>
|
||||
|
||||
### Sliding window Attention
|
||||
|
||||
The current implementation supports the sliding window attention mechanism and memory efficient cache management.
|
||||
To enable sliding window attention, just make sure to have a `flash-attn` version that is compatible with sliding window attention (`>=2.3.0`).
|
||||
|
||||
The Flash Attention 2 integration also uses a more memory-efficient cache slicing mechanism. As recommended by the official implementation of the Mistral model, which uses a rolling cache, we keep the cache size fixed (`self.config.sliding_window`), support batched generation only for `padding_side="left"`, and use the absolute position of the current token to compute the positional embedding.
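A minimal sketch of the version check and of where the fixed cache size comes from (the exact `sliding_window` value is read from the checkpoint's config):

```python
import flash_attn
from packaging import version
from transformers import AutoConfig

# sliding window attention requires flash-attn >= 2.3.0
assert version.parse(flash_attn.__version__) >= version.parse("2.3.0")

config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
print(config.sliding_window)  # the fixed rolling-cache size used during generation
```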
|
||||
|
||||
## The Mistral Team
|
||||
|
||||
Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
|
||||
|
||||
## MistralConfig
|
||||
|
||||
[[autodoc]] MistralConfig
|
||||
|
||||
## MistralModel
|
||||
|
||||
[[autodoc]] MistralModel
|
||||
- forward
|
||||
|
||||
## MistralForCausalLM
|
||||
|
||||
[[autodoc]] MistralForCausalLM
|
||||
- forward
|
||||
|
||||
## MistralForSequenceClassification
|
||||
|
||||
[[autodoc]] MistralForSequenceClassification
|
||||
- forward
|
||||
@ -22,7 +22,7 @@ The MRA model was proposed in [Multi Resolution Analysis (MRA) for Approximate S
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Transformers have emerged as a preferred model for many tasks in natural langugage processing and vision. Recent efforts on training and deploying Transformers more efficiently have identified many strategies to approximate the self-attention matrix, a key module in a Transformer architecture. Effective ideas include various prespecified sparsity patterns, low-rank basis expansions and combinations thereof. In this paper, we revisit classical Multiresolution Analysis (MRA) concepts such as Wavelets, whose potential value in this setting remains underexplored thus far. We show that simple approximations based on empirical feedback and design choices informed by modern hardware and implementation challenges, eventually yield a MRA-based approach for self-attention with an excellent performance profile across most criteria of interest. We undertake an extensive set of experiments and demonstrate that this multi-resolution scheme outperforms most efficient self-attention proposals and is favorable for both short and long sequences. Code is available at https://github.com/mlpen/mra-attention.*
|
||||
*Transformers have emerged as a preferred model for many tasks in natural language processing and vision. Recent efforts on training and deploying Transformers more efficiently have identified many strategies to approximate the self-attention matrix, a key module in a Transformer architecture. Effective ideas include various prespecified sparsity patterns, low-rank basis expansions and combinations thereof. In this paper, we revisit classical Multiresolution Analysis (MRA) concepts such as Wavelets, whose potential value in this setting remains underexplored thus far. We show that simple approximations based on empirical feedback and design choices informed by modern hardware and implementation challenges, eventually yield a MRA-based approach for self-attention with an excellent performance profile across most criteria of interest. We undertake an extensive set of experiments and demonstrate that this multi-resolution scheme outperforms most efficient self-attention proposals and is favorable for both short and long sequences. Code is available at https://github.com/mlpen/mra-attention.*
|
||||
|
||||
This model was contributed by [novice03](https://huggingface.co/novice03).
|
||||
The original code can be found [here](https://github.com/mlpen/mra-attention).
|
||||
|
||||
@ -53,7 +53,7 @@ which means that tokens have less probability of being forwarded. Moreover, if a
|
||||
states (kind of like a residual connection) while they are masked in `NLLB`'s top-2 routing mechanism.
|
||||
|
||||
## Generating with NLLB-MoE
|
||||
The avalable checkpoints requires around 350GB of storage. Make sure to use `accelerate` if you do not have enough RAM on your machine.
|
||||
The available checkpoints require around 350GB of storage. Make sure to use `accelerate` if you do not have enough RAM on your machine.
|
||||
|
||||
While generating the target text set the `forced_bos_token_id` to the target language id. The following
|
||||
example shows how to translate English to French using the *facebook/nllb-200-distilled-600M* model.
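A minimal sketch of such a snippet (it assumes the standard NLLB tokenizer API, in particular `lang_code_to_id`):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

inputs = tokenizer("Life is like a box of chocolates.", return_tensors="pt")
generated = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"],  # force French as the target language
    max_new_tokens=50,
)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])
```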
|
||||
|
||||
109
docs/source/en/model_doc/nougat.md
Normal file
@ -0,0 +1,109 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# Nougat
|
||||
|
||||
## Overview
|
||||
|
||||
The Nougat model was proposed in [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by
|
||||
Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. Nougat uses the same architecture as [Donut](donut), meaning an image Transformer
|
||||
encoder and an autoregressive text Transformer decoder to translate scientific PDFs to markdown, enabling easier access to them.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Scientific knowledge is predominantly stored in books and scientific journals, often in the form of PDFs. However, the PDF format leads to a loss of semantic information, particularly for mathematical expressions. We propose Nougat (Neural Optical Understanding for Academic Documents), a Visual Transformer model that performs an Optical Character Recognition (OCR) task for processing scientific documents into a markup language, and demonstrate the effectiveness of our model on a new dataset of scientific documents. The proposed approach offers a promising solution to enhance the accessibility of scientific knowledge in the digital age, by bridging the gap between human-readable documents and machine-readable text. We release the models and code to accelerate future work on scientific text recognition.*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/nougat_architecture.jpg"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> Nougat high-level overview. Taken from the <a href="https://arxiv.org/abs/2308.13418">original paper</a>. </small>
|
||||
|
||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found
|
||||
[here](https://github.com/facebookresearch/nougat).
|
||||
|
||||
Tips:
|
||||
|
||||
- The quickest way to get started with Nougat is by checking the [tutorial
|
||||
notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Nougat), which show how to use the model
|
||||
at inference time as well as fine-tuning on custom data.
|
||||
- Nougat is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework. The model is identical to [Donut](donut) in terms of architecture.
|
||||
|
||||
## Inference
|
||||
|
||||
Nougat's [`VisionEncoderDecoder`] model accepts images as input and makes use of
|
||||
[`~generation.GenerationMixin.generate`] to autoregressively generate text given the input image.
|
||||
|
||||
The [`NougatImageProcessor`] class is responsible for preprocessing the input image and
|
||||
[`NougatTokenizerFast`] decodes the generated target tokens to the target string. The
|
||||
[`NougatProcessor`] wraps [`NougatImageProcessor`] and [`NougatTokenizerFast`] classes
|
||||
into a single instance to both extract the input features and decode the predicted token ids.
|
||||
|
||||
- Step-by-step PDF transcription
|
||||
|
||||
```py
|
||||
>>> from huggingface_hub import hf_hub_download
|
||||
>>> import re
|
||||
>>> from PIL import Image
|
||||
|
||||
>>> from transformers import NougatProcessor, VisionEncoderDecoderModel
|
||||
>>> from datasets import load_dataset
|
||||
>>> import torch
|
||||
|
||||
>>> processor = NougatProcessor.from_pretrained("facebook/nougat-base")
|
||||
>>> model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base")
|
||||
|
||||
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
>>> model.to(device) # doctest: +IGNORE_RESULT
|
||||
|
||||
>>> # prepare PDF image for the model
|
||||
>>> filepath = hf_hub_download(repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_paper.png", repo_type="dataset")
|
||||
>>> image = Image.open(filepath)
|
||||
>>> pixel_values = processor(image, return_tensors="pt").pixel_values
|
||||
|
||||
>>> # generate transcription (here we only generate 30 tokens)
|
||||
>>> outputs = model.generate(
|
||||
... pixel_values.to(device),
|
||||
... min_length=1,
|
||||
... max_new_tokens=30,
|
||||
... bad_words_ids=[[processor.tokenizer.unk_token_id]],
|
||||
... )
|
||||
|
||||
>>> sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
||||
>>> sequence = processor.post_process_generation(sequence, fix_markdown=False)
|
||||
>>> # note: we're using repr here for the sake of printing the \n characters; feel free to just print the sequence
|
||||
>>> print(repr(sequence))
|
||||
'\n\n# Nougat: Neural Optical Understanding for Academic Documents\n\n Lukas Blecher\n\nCorrespondence to: lblecher@'
|
||||
```
|
||||
|
||||
See the [model hub](https://huggingface.co/models?filter=nougat) to look for Nougat checkpoints.
|
||||
|
||||
## NougatImageProcessor
|
||||
|
||||
[[autodoc]] NougatImageProcessor
|
||||
- preprocess
|
||||
|
||||
## NougatTokenizerFast
|
||||
|
||||
[[autodoc]] NougatTokenizerFast
|
||||
|
||||
## NougatProcessor
|
||||
|
||||
[[autodoc]] NougatProcessor
|
||||
- __call__
|
||||
- from_pretrained
|
||||
- save_pretrained
|
||||
- batch_decode
|
||||
- decode
|
||||
- post_process_generation
|
||||
@ -33,15 +33,13 @@ This model differs from the [OpenLLaMA models](https://huggingface.co/models?sea
|
||||
|
||||
## Overview
|
||||
|
||||
The Open-Llama model was proposed in [Open-Llama project](https://github.com/s-JoL/Open-Llama) by community developer s-JoL.
|
||||
The Open-Llama model was proposed in the open source Open-Llama project by community developer s-JoL.
|
||||
|
||||
The model is mainly based on LLaMA with some modifications, incorporating memory-efficient attention from Xformers, stable embedding from Bloom, and shared input-output embedding from PaLM.
|
||||
And the model is pre-trained on both Chinese and English, which gives it better performance on Chinese language tasks.
|
||||
|
||||
This model was contributed by [s-JoL](https://huggingface.co/s-JoL).
|
||||
The original code can be found [Open-Llama](https://github.com/s-JoL/Open-Llama).
|
||||
Checkpoint and usage can be found at [s-JoL/Open-Llama-V1](https://huggingface.co/s-JoL/Open-Llama-V1).
|
||||
|
||||
The original code was released on GitHub by [s-JoL](https://github.com/s-JoL), but is now removed.
|
||||
|
||||
## OpenLlamaConfig
|
||||
|
||||
|
||||
126
docs/source/en/model_doc/owlv2.md
Normal file
@ -0,0 +1,126 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# OWLv2
|
||||
|
||||
## Overview
|
||||
|
||||
OWLv2 was proposed in [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby. OWLv2 scales up [OWL-ViT](owlvit) using self-training, which uses an existing detector to generate pseudo-box annotations on image-text pairs. This results in large gains over the previous state-of-the-art for zero-shot object detection.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Open-vocabulary object detection has benefited greatly from pretrained vision-language models, but is still limited by the amount of available detection training data. While detection training data can be expanded by using Web image-text pairs as weak supervision, this has not been done at scales comparable to image-level pretraining. Here, we scale up detection data with self-training, which uses an existing detector to generate pseudo-box annotations on image-text pairs. Major challenges in scaling self-training are the choice of label space, pseudo-annotation filtering, and training efficiency. We present the OWLv2 model and OWL-ST self-training recipe, which address these challenges. OWLv2 surpasses the performance of previous state-of-the-art open-vocabulary detectors already at comparable training scales (~10M examples). However, with OWL-ST, we can scale to over 1B examples, yielding further large improvement: With an L/14 architecture, OWL-ST improves AP on LVIS rare classes, for which the model has seen no human box annotations, from 31.2% to 44.6% (43% relative improvement). OWL-ST unlocks Web-scale training for open-world localization, similar to what has been seen for image classification and language modelling.*
|
||||
|
||||
Tips:
|
||||
|
||||
- The architecture of OWLv2 is identical to [OWL-ViT](owlvit), however the object detection head now also includes an objectness classifier, which predicts the (query-agnostic) likelihood that a predicted box contains an object (as opposed to background). The objectness score can be used to rank or filter predictions independently of text queries.
|
||||
- Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image processor ([`Owlv2ImageProcessor`]).
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/owlv2_overview.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> OWLv2 high-level overview. Taken from the <a href="https://arxiv.org/abs/2306.09683">original paper</a>. </small>
|
||||
|
||||
This model was contributed by [nielsr](https://huggingface.co/nielsr).
|
||||
The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit).
|
||||
|
||||
## Usage
|
||||
|
||||
OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection.
|
||||
|
||||
[`Owlv2ImageProcessor`] can be used to resize (or rescale) and normalize images for the model and [`CLIPTokenizer`] is used to encode the text. [`Owlv2Processor`] wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`] into a single instance to both encode the text and prepare the images. The following example shows how to perform object detection using [`Owlv2Processor`] and [`Owlv2ForObjectDetection`].
|
||||
|
||||
|
||||
```python
|
||||
>>> import requests
|
||||
>>> from PIL import Image
|
||||
>>> import torch
|
||||
|
||||
>>> from transformers import Owlv2Processor, Owlv2ForObjectDetection
|
||||
|
||||
>>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
|
||||
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
|
||||
|
||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
>>> texts = [["a photo of a cat", "a photo of a dog"]]
|
||||
>>> inputs = processor(text=texts, images=image, return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
|
||||
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
|
||||
>>> target_sizes = torch.Tensor([image.size[::-1]])
|
||||
>>> # Convert outputs (bounding boxes and class logits) to COCO API
|
||||
>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
|
||||
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries
|
||||
>>> text = texts[i]
|
||||
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
|
||||
>>> for box, score, label in zip(boxes, scores, labels):
|
||||
... box = [round(i, 2) for i in box.tolist()]
|
||||
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
|
||||
Detected a photo of a cat with confidence 0.614 at location [341.67, 17.54, 642.32, 278.51]
|
||||
Detected a photo of a cat with confidence 0.665 at location [6.75, 38.97, 326.62, 354.85]
|
||||
```
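As mentioned in the tips above, OWLv2 also predicts a query-agnostic objectness score. A minimal sketch of using it to rank boxes, reusing `outputs` from the snippet above (the `objectness_logits` field name is an assumption about the detection output):

```python
>>> objectness = outputs.objectness_logits[0].sigmoid()  # one score per patch, higher = more "object-like"
>>> top_indices = objectness.topk(5).indices
>>> top_boxes = outputs.pred_boxes[0][top_indices]  # boxes ranked independently of the text queries
```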
|
||||
|
||||
## Resources
|
||||
|
||||
A demo notebook on using OWLv2 for zero- and one-shot (image-guided) object detection can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/OWLv2).
|
||||
|
||||
## Owlv2Config
|
||||
|
||||
[[autodoc]] Owlv2Config
|
||||
- from_text_vision_configs
|
||||
|
||||
## Owlv2TextConfig
|
||||
|
||||
[[autodoc]] Owlv2TextConfig
|
||||
|
||||
## Owlv2VisionConfig
|
||||
|
||||
[[autodoc]] Owlv2VisionConfig
|
||||
|
||||
## Owlv2ImageProcessor
|
||||
|
||||
[[autodoc]] Owlv2ImageProcessor
|
||||
- preprocess
|
||||
- post_process_object_detection
|
||||
- post_process_image_guided_detection
|
||||
|
||||
## Owlv2Processor
|
||||
|
||||
[[autodoc]] Owlv2Processor
|
||||
|
||||
## Owlv2Model
|
||||
|
||||
[[autodoc]] Owlv2Model
|
||||
- forward
|
||||
- get_text_features
|
||||
- get_image_features
|
||||
|
||||
## Owlv2TextModel
|
||||
|
||||
[[autodoc]] Owlv2TextModel
|
||||
- forward
|
||||
|
||||
## Owlv2VisionModel
|
||||
|
||||
[[autodoc]] Owlv2VisionModel
|
||||
- forward
|
||||
|
||||
## Owlv2ForObjectDetection
|
||||
|
||||
[[autodoc]] Owlv2ForObjectDetection
|
||||
- forward
|
||||
- image_guided_detection
|
||||
@ -24,6 +24,13 @@ The abstract from the paper is the following:
|
||||
|
||||
*Combining simple architectures with large-scale pre-training has led to massive improvements in image classification. For object detection, pre-training and scaling approaches are less well established, especially in the long-tailed and open-vocabulary setting, where training data is relatively scarce. In this paper, we propose a strong recipe for transferring image-text models to open-vocabulary object detection. We use a standard Vision Transformer architecture with minimal modifications, contrastive image-text pre-training, and end-to-end detection fine-tuning. Our analysis of the scaling properties of this setup shows that increasing image-level pre-training and model size yield consistent improvements on the downstream detection task. We provide the adaptation strategies and regularizations needed to attain very strong performance on zero-shot text-conditioned and one-shot image-conditioned object detection. Code and models are available on GitHub.*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/owlvit_architecture.jpg"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> OWL-ViT architecture. Taken from the <a href="https://arxiv.org/abs/2205.06230">original paper</a>. </small>
|
||||
|
||||
This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit).
|
||||
|
||||
## Usage
|
||||
|
||||
OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection.
|
||||
@ -61,7 +68,9 @@ Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.
|
||||
Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
|
||||
```
|
||||
|
||||
This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit).
|
||||
## Resources
|
||||
|
||||
A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object detection can be found [here](https://github.com/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb).
|
||||
|
||||
## OwlViTConfig
|
||||
|
||||
|
||||
218
docs/source/en/model_doc/seamless_m4t.md
Normal file
@ -0,0 +1,218 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# SeamlessM4T
|
||||
|
||||
## Overview
|
||||
|
||||
The SeamlessM4T model was proposed in [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team from Meta AI.
|
||||
|
||||
SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
|
||||
|
||||
SeamlessM4T enables multiple tasks without relying on separate models:
|
||||
|
||||
- Speech-to-speech translation (S2ST)
|
||||
- Speech-to-text translation (S2TT)
|
||||
- Text-to-speech translation (T2ST)
|
||||
- Text-to-text translation (T2TT)
|
||||
- Automatic speech recognition (ASR)
|
||||
|
||||
[`SeamlessM4TModel`] can perform all the above tasks, but each task also has its own dedicated sub-model.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*What does it take to create the Babel Fish, a tool that can help individuals translate speech between any two languages? While recent breakthroughs in text-based models have pushed machine translation coverage beyond 200 languages, unified speech-to-speech translation models have yet to achieve similar strides. More specifically, conventional speech-to-speech translation systems rely on cascaded systems that perform translation progressively, putting high-performing unified systems out of reach. To address these gaps, we introduce SeamlessM4T, a single model that supports speech-to-speech translation, speech-to-text translation, text-to-speech translation, text-to-text translation, and automatic speech recognition for up to 100 languages. To build this, we used 1 million hours of open speech audio data to learn self-supervised speech representations with w2v-BERT 2.0. Subsequently, we created a multimodal corpus of automatically aligned speech translations. Filtered and combined with human-labeled and pseudo-labeled data, we developed the first multilingual system capable of translating from and into English for both speech and text. On FLEURS, SeamlessM4T sets a new standard for translations into multiple target languages, achieving an improvement of 20% BLEU over the previous SOTA in direct speech-to-text translation. Compared to strong cascaded models, SeamlessM4T improves the quality of into-English translation by 1.3 BLEU points in speech-to-text and by 2.6 ASR-BLEU points in speech-to-speech. Tested for robustness, our system performs better against background noises and speaker variations in speech-to-text tasks compared to the current SOTA model. Critically, we evaluated SeamlessM4T on gender bias and added toxicity to assess translation safety. Finally, all contributions in this work are open-sourced and accessible at https://github.com/facebookresearch/seamless_communication*
|
||||
|
||||
## Usage
|
||||
|
||||
First, load the processor and a checkpoint of the model:
|
||||
|
||||
```python
|
||||
>>> from transformers import AutoProcessor, SeamlessM4TModel
|
||||
|
||||
>>> processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
|
||||
>>> model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
|
||||
```
|
||||
|
||||
You can seamlessly use this model on text or on audio, to generate either translated text or translated audio.
|
||||
|
||||
Here is how to use the processor to process text and audio:
|
||||
|
||||
```python
|
||||
>>> # let's load an audio sample from an Arabic speech corpus
|
||||
>>> from datasets import load_dataset
|
||||
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
|
||||
>>> audio_sample = next(iter(dataset))["audio"]
|
||||
|
||||
>>> # now, process it
|
||||
>>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt")
|
||||
|
||||
>>> # now, process some English text as well
|
||||
>>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
|
||||
```
|
||||
|
||||
|
||||
### Speech
|
||||
|
||||
[`SeamlessM4TModel`] can *seamlessly* generate text or speech with few or no changes. Let's target Russian voice translation:
|
||||
|
||||
```python
|
||||
>>> audio_array_from_text = model.generate(**text_inputs, tgt_lang="rus")[0].cpu().numpy().squeeze()
|
||||
>>> audio_array_from_audio = model.generate(**audio_inputs, tgt_lang="rus")[0].cpu().numpy().squeeze()
|
||||
```
|
||||
|
||||
With basically the same code, we've translated English text and Arabic speech to Russian speech samples.
|
||||
|
||||
### Text
|
||||
|
||||
Similarly, you can generate translated text from audio files or from text with the same model. You only have to pass `generate_speech=False` to [`SeamlessM4TModel.generate`].
|
||||
This time, let's translate to French.
|
||||
|
||||
```python
|
||||
>>> # from audio
|
||||
>>> output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False)
|
||||
>>> translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
|
||||
|
||||
>>> # from text
|
||||
>>> output_tokens = model.generate(**text_inputs, tgt_lang="fra", generate_speech=False)
|
||||
>>> translated_text_from_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
|
||||
```
|
||||
|
||||
### Tips
|
||||
|
||||
|
||||
#### 1. Use dedicated models
|
||||
|
||||
[`SeamlessM4TModel`] is transformers' top-level model for generating speech and text, but you can also use dedicated models that perform the task without additional components, thus reducing the memory footprint.
|
||||
For example, you can replace the audio-to-audio generation snippet with the model dedicated to the S2ST task; the rest is exactly the same code:
|
||||
|
||||
```python
|
||||
>>> from transformers import SeamlessM4TForSpeechToSpeech
|
||||
>>> model = SeamlessM4TForSpeechToSpeech.from_pretrained("facebook/hf-seamless-m4t-medium")
|
||||
```
|
||||
|
||||
Or you can replace the text-to-text generation snippet with the model dedicated to the T2TT task; you only have to remove `generate_speech=False`.
|
||||
|
||||
```python
|
||||
>>> from transformers import SeamlessM4TForTextToText
|
||||
>>> model = SeamlessM4TForTextToText.from_pretrained("facebook/hf-seamless-m4t-medium")
|
||||
```
|
||||
|
||||
Feel free to try out [`SeamlessM4TForSpeechToText`] and [`SeamlessM4TForTextToSpeech`] as well.
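For instance, reusing the same checkpoint as above:

```python
>>> from transformers import SeamlessM4TForSpeechToText, SeamlessM4TForTextToSpeech

>>> s2t_model = SeamlessM4TForSpeechToText.from_pretrained("facebook/hf-seamless-m4t-medium")
>>> t2s_model = SeamlessM4TForTextToSpeech.from_pretrained("facebook/hf-seamless-m4t-medium")
```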
|
||||
|
||||
#### 2. Change the speaker identity
|
||||
|
||||
You can change the speaker used for speech synthesis with the `spkr_id` argument. Some `spkr_id` values work better than others for certain languages!
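For example, a minimal sketch reusing the `model` and `text_inputs` from the snippets above (the `spkr_id` value here is arbitrary):

```python
>>> # generate Russian speech with a different speaker (the value 3 is just an example)
>>> audio_array = model.generate(**text_inputs, tgt_lang="rus", spkr_id=3)[0].cpu().numpy().squeeze()
```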
|
||||
|
||||
#### 3. Change the generation strategy
|
||||
|
||||
You can use different [generation strategies](./generation_strategies) for speech and text generation, e.g. `.generate(input_ids=input_ids, text_num_beams=4, speech_do_sample=True)`, which successively performs beam-search decoding on the text model and multinomial sampling on the speech model.
|
||||
|
||||
#### 4. Generate speech and text at the same time
|
||||
|
||||
Use `return_intermediate_token_ids=True` with [`SeamlessM4TModel`] to return both speech and text!
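A minimal sketch, reusing the objects from above (treat the attribute names on the returned output as assumptions and inspect the object in your environment):

```python
>>> outputs = model.generate(**text_inputs, tgt_lang="rus", return_intermediate_token_ids=True)
>>> # the output bundles the synthesized waveform together with the intermediate text token ids
>>> audio_array = outputs.waveform
>>> text_tokens = outputs.sequences
```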
|
||||
|
||||
## Model architecture
|
||||
|
||||
|
||||
SeamlessM4T features a versatile architecture that smoothly handles the sequential generation of text and speech. This setup comprises two sequence-to-sequence (seq2seq) models. The first model translates the input modality into translated text, while the second model generates speech tokens, known as "unit tokens," from the translated text.
|
||||
|
||||
Each modality has its own dedicated encoder with a unique architecture. Additionally, for speech output, a vocoder inspired by the [HiFi-GAN](https://arxiv.org/abs/2010.05646) architecture is placed on top of the second seq2seq model.
|
||||
|
||||
Here's how the generation process works:
|
||||
|
||||
- Input text or speech is processed through its specific encoder.
|
||||
- A decoder creates text tokens in the desired language.
|
||||
- If speech generation is required, the second seq2seq model, following a standard encoder-decoder structure, generates unit tokens.
|
||||
- These unit tokens are then passed through the final vocoder to produce the actual speech.
|
||||
|
||||
|
||||
This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The original code can be found [here](https://github.com/facebookresearch/seamless_communication).
|
||||
|
||||
## SeamlessM4TModel
|
||||
|
||||
[[autodoc]] SeamlessM4TModel
|
||||
- generate
|
||||
|
||||
|
||||
## SeamlessM4TForTextToSpeech
|
||||
|
||||
[[autodoc]] SeamlessM4TForTextToSpeech
|
||||
- generate
|
||||
|
||||
|
||||
## SeamlessM4TForSpeechToSpeech
|
||||
|
||||
[[autodoc]] SeamlessM4TForSpeechToSpeech
|
||||
- generate
|
||||
|
||||
|
||||
## SeamlessM4TForTextToText
|
||||
|
||||
[[autodoc]] transformers.SeamlessM4TForTextToText
|
||||
- forward
|
||||
- generate
|
||||
|
||||
## SeamlessM4TForSpeechToText
|
||||
|
||||
[[autodoc]] transformers.SeamlessM4TForSpeechToText
|
||||
- forward
|
||||
- generate
|
||||
|
||||
## SeamlessM4TConfig
|
||||
|
||||
[[autodoc]] SeamlessM4TConfig
|
||||
|
||||
|
||||
## SeamlessM4TTokenizer
|
||||
|
||||
[[autodoc]] SeamlessM4TTokenizer
|
||||
- __call__
|
||||
- build_inputs_with_special_tokens
|
||||
- get_special_tokens_mask
|
||||
- create_token_type_ids_from_sequences
|
||||
- save_vocabulary
|
||||
|
||||
|
||||
## SeamlessM4TTokenizerFast
|
||||
|
||||
[[autodoc]] SeamlessM4TTokenizerFast
|
||||
- __call__
|
||||
|
||||
## SeamlessM4TFeatureExtractor
|
||||
|
||||
[[autodoc]] SeamlessM4TFeatureExtractor
|
||||
- __call__
|
||||
|
||||
## SeamlessM4TProcessor
|
||||
|
||||
[[autodoc]] SeamlessM4TProcessor
|
||||
- __call__
|
||||
|
||||
## SeamlessM4TCodeHifiGan
|
||||
|
||||
[[autodoc]] SeamlessM4TCodeHifiGan
|
||||
|
||||
|
||||
## SeamlessM4THifiGan
|
||||
|
||||
[[autodoc]] SeamlessM4THifiGan
|
||||
|
||||
## SeamlessM4TTextToUnitModel
|
||||
|
||||
[[autodoc]] SeamlessM4TTextToUnitModel
|
||||
|
||||
## SeamlessM4TTextToUnitForConditionalGeneration
|
||||
|
||||
[[autodoc]] SeamlessM4TTextToUnitForConditionalGeneration
|
||||
|
||||
|
||||
@ -47,7 +47,7 @@ review it! The resource should ideally demonstrate something new instead of dupl
|
||||
**Video classification**
|
||||
- [A notebook](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb) that shows how
|
||||
to fine-tune a VideoMAE model on a custom dataset.
|
||||
- [Video classification task guide](../tasks/video_classification)
|
||||
- [A ЁЯдЧ Space](https://huggingface.co/spaces/sayakpaul/video-classification-ucf101-subset) showing how to perform inference with a video classification model.
|
||||
|
||||
|
||||
|
||||
@ -10,12 +10,12 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# ViTMatte
|
||||
|
||||
## Overview
|
||||
|
||||
The ViTMatte model was proposed in [Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
|
||||
ViTMatte leverages plain [Vision Transformers](vit) for the task of image matting, which is the process of accurately estimating the foreground object in images and videos.
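As a quick illustration, here is a minimal inference sketch. The checkpoint name and the `trimaps`/`alphas` field names are assumptions based on the ViTMatte classes in Transformers; the demo notebook linked in the Resources section below covers the workflow end to end.

```python
import torch
from PIL import Image
from transformers import VitMatteImageProcessor, VitMatteForImageMatting

checkpoint = "hustvl/vitmatte-small-composition-1k"  # example checkpoint
processor = VitMatteImageProcessor.from_pretrained(checkpoint)
model = VitMatteForImageMatting.from_pretrained(checkpoint)

image = Image.open("photo.png").convert("RGB")   # image to matte
trimap = Image.open("trimap.png").convert("L")   # rough foreground/background/unknown mask

inputs = processor(images=image, trimaps=trimap, return_tensors="pt")
with torch.no_grad():
    alphas = model(**inputs).alphas              # predicted alpha matte
```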
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
@ -28,6 +28,17 @@ Tips:
|
||||
This model was contributed by [nielsr](https://huggingface.co/nielsr).
|
||||
The original code can be found [here](https://github.com/hustvl/ViTMatte).
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitmatte_architecture.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> ViTMatte high-level overview. Taken from the <a href="https://arxiv.org/abs/2305.15272">original paper.</a> </small>
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by ЁЯМО) resources to help you get started with ViTMatte.
|
||||
|
||||
- A demo notebook regarding inference with [`VitMatteForImageMatting`], including background replacement, can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ViTMatte).
|
||||
|
||||
|
||||
## VitMatteConfig
|
||||
|
||||
|
||||
@ -48,6 +48,8 @@ The original code can be found [here](https://github.com/openai/whisper).
|
||||
- get_special_tokens_mask
|
||||
- create_token_type_ids_from_sequences
|
||||
- save_vocabulary
|
||||
- batch_decode
|
||||
- decode
|
||||
|
||||
## WhisperTokenizerFast
|
||||
|
||||
@ -57,6 +59,8 @@ The original code can be found [here](https://github.com/openai/whisper).
|
||||
- get_special_tokens_mask
|
||||
- create_token_type_ids_from_sequences
|
||||
- save_vocabulary
|
||||
- batch_decode
|
||||
- decode
|
||||
|
||||
## WhisperFeatureExtractor
|
||||
|
||||
@ -82,6 +86,12 @@ The original code can be found [here](https://github.com/openai/whisper).
|
||||
|
||||
[[autodoc]] WhisperForConditionalGeneration
|
||||
- forward
|
||||
- generate
|
||||
|
||||
## WhisperForCausalLM
|
||||
|
||||
[[autodoc]] WhisperForCausalLM
|
||||
- forward
|
||||
|
||||
## WhisperForAudioClassification
|
||||
|
||||
|
||||
@ -13,46 +13,48 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Efficient Inference on CPU
|
||||
# CPU inference
|
||||
|
||||
This guide focuses on inferencing large models efficiently on CPU.
|
||||
With some optimizations, it is possible to efficiently run large model inference on a CPU. One of these optimization techniques involves compiling the PyTorch code into an intermediate format for high-performance environments like C++. The other technique fuses multiple operations into one kernel to reduce the overhead of running each operation separately.
|
||||
|
||||
## `BetterTransformer` for faster inference
|
||||
You'll learn how to use [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) for faster inference, and how to convert your PyTorch code to [TorchScript](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html). If you're using an Intel CPU, you can also use [graph optimizations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features.html#graph-optimization) from [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/index.html) to boost inference speed even more. Finally, learn how to use ЁЯдЧ Optimum to accelerate inference with ONNX Runtime or OpenVINO (if you're using an Intel CPU).
|
||||
|
||||
We have recently integrated `BetterTransformer` for faster inference on CPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details.
|
||||
## BetterTransformer
|
||||
|
||||
## PyTorch JIT-mode (TorchScript)
|
||||
TorchScript is a way to create serializable and optimizable models from PyTorch code. Any TorchScript program can be saved from a Python process and loaded in a process where there is no Python dependency.
|
||||
Compared to the default eager mode, JIT mode in PyTorch normally yields better performance for model inference thanks to optimization methodologies like operator fusion.
|
||||
BetterTransformer accelerates inference with its fastpath (native PyTorch specialized implementation of Transformer functions) execution. The two optimizations in the fastpath execution are:
|
||||
|
||||
For a gentle introduction to TorchScript, see the Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules).
|
||||
1. fusion, which combines multiple sequential operations into a single "kernel" to reduce the number of computation steps
|
||||
2. skipping the inherent sparsity of padding tokens to avoid unnecessary computation with nested tensors
|
||||
|
||||
### IPEX Graph Optimization with JIT-mode
|
||||
Intel® Extension for PyTorch provides further optimizations in JIT mode for Transformers models, and we highly recommend taking advantage of it. Frequently used operator patterns from Transformers models are already supported with JIT-mode fusions, such as Multi-head attention, Concat Linear, Linear + Add, Linear + Gelu, and Add + LayerNorm. These fusions are enabled, perform well, and their benefit is delivered transparently. According to our analysis, ~70% of the most popular NLP tasks in question answering, text classification, and token classification can get performance benefits from these fusion patterns for both Float32 precision and BFloat16 mixed precision.
|
||||
BetterTransformer also converts all attention operations to use the more memory-efficient [scaled dot product attention](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention).
|
||||
|
||||
See [IPEX Graph Optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html) for more detailed information.
|
||||
<Tip>
|
||||
|
||||
#### IPEX installation:
|
||||
|
||||
IPEX releases follow PyTorch releases; check the [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/) guide for installation approaches.
|
||||
|
||||
### Usage of JIT-mode
|
||||
To enable JIT-mode in Trainer for evaluation or prediction, users should add `jit_mode_eval` to the Trainer command arguments.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
For PyTorch >= 1.14.0, JIT-mode could benefit any model for prediction and evaluation since dict input is supported in jit.trace
|
||||
|
||||
For PyTorch < 1.14.0, JIT-mode could benefit models whose forward parameter order matches the tuple input order in jit.trace, like a question-answering model
|
||||
In the case where the forward parameter order does not match the tuple input order in jit.trace, like text-classification models, jit.trace will fail, and this is caught with an exception so that it falls back to eager mode. Logging is used to notify users.
|
||||
BetterTransformer is not supported for all models. Check this [list](https://huggingface.co/docs/optimum/bettertransformer/overview#supported-models) to see if a model supports BetterTransformer.
|
||||
|
||||
</Tip>
|
||||
|
||||
As an example, consider the use case of [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)
|
||||
Before you start, make sure you have ЁЯдЧ Optimum [installed](https://huggingface.co/docs/optimum/installation).
|
||||
|
||||
Enable BetterTransformer with the [`PreTrainedModel.to_bettertransformer`] method:
|
||||
|
||||
- Inference using jit mode on CPU:
|
||||
<pre>python run_qa.py \
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder")
|
||||
model.to_bettertransformer()
|
||||
```
|
||||
|
||||
## TorchScript
|
||||
|
||||
TorchScript is an intermediate PyTorch model representation that can be run in production environments where performance is important. You can train a model in PyTorch and then export it to TorchScript to free the model from Python performance constraints. PyTorch [traces](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) a model to return a [`ScriptFunction`] that is optimized with just-in-time compilation (JIT). Compared to the default eager mode, JIT mode in PyTorch typically yields better performance for inference using optimization techniques like operator fusion.
|
||||
|
||||
For a gentle introduction to TorchScript, see the [Introduction to PyTorch TorchScript](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html) tutorial.
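As a quick, hedged sketch (the checkpoint is only an example; `torchscript=True` makes the model return tuples so it can be traced):

```py
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"  # example checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, torchscript=True)
model.eval()

# trace with fixed example inputs, then reuse the traced module for inference
inputs = tokenizer("TorchScript needs example inputs for tracing.", return_tensors="pt")
traced_model = torch.jit.trace(model, (inputs["input_ids"], inputs["attention_mask"]))

with torch.no_grad():
    logits = traced_model(inputs["input_ids"], inputs["attention_mask"])[0]
```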
|
||||
|
||||
With the [`Trainer`] class, you can enable JIT mode for CPU inference by setting the `--jit_mode_eval` flag:
|
||||
|
||||
```bash
|
||||
python run_qa.py \
|
||||
--model_name_or_path csarron/bert-base-uncased-squad-v1 \
|
||||
--dataset_name squad \
|
||||
--do_eval \
|
||||
@ -60,10 +62,31 @@ Take an example of the use cases on [Transformers question-answering](https://gi
|
||||
--doc_stride 128 \
|
||||
--output_dir /tmp/ \
|
||||
--no_cuda \
|
||||
<b>--jit_mode_eval </b></pre>
|
||||
--jit_mode_eval
|
||||
```
|
||||
|
||||
- Inference with IPEX using jit mode on CPU:
|
||||
<pre>python run_qa.py \
|
||||
<Tip warning={true}>
|
||||
|
||||
For PyTorch >= 1.14.0, JIT-mode could benefit any model for prediction and evaluation since the dict input is supported in `jit.trace`.
|
||||
|
||||
For PyTorch < 1.14.0, JIT-mode could benefit a model if its forward parameter order matches the tuple input order in `jit.trace`, such as a question-answering model. If the forward parameter order does not match the tuple input order in `jit.trace`, like a text classification model, `jit.trace` will fail, and this is caught with an exception so that it falls back to eager mode. Logging is used to notify users.
|
||||
|
||||
</Tip>
|
||||
|
||||
## IPEX graph optimization
|
||||
|
||||
Intel® Extension for PyTorch (IPEX) provides further optimizations in JIT mode for Intel CPUs, and we recommend combining it with TorchScript for even faster performance. The IPEX [graph optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html) fuses operations like Multi-head attention, Concat Linear, Linear + Add, Linear + Gelu, Add + LayerNorm, and more.
|
||||
|
||||
To take advantage of these graph optimizations, make sure you have IPEX [installed](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html):
|
||||
|
||||
```bash
|
||||
pip install intel_extension_for_pytorch
|
||||
```
|
||||
|
||||
Set the `--use_ipex` and `--jit_mode_eval` flags in the [`Trainer`] class to enable JIT mode with the graph optimizations:
|
||||
|
||||
```bash
|
||||
python run_qa.py \
|
||||
--model_name_or_path csarron/bert-base-uncased-squad-v1 \
|
||||
--dataset_name squad \
|
||||
--do_eval \
|
||||
@ -71,5 +94,34 @@ Take an example of the use cases on [Transformers question-answering](https://gi
|
||||
--doc_stride 128 \
|
||||
--output_dir /tmp/ \
|
||||
--no_cuda \
|
||||
<b>--use_ipex \</b>
|
||||
<b>--jit_mode_eval</b></pre>
|
||||
--use_ipex \
|
||||
--jit_mode_eval
|
||||
```
|
||||
|
||||
## ЁЯдЧ Optimum
|
||||
|
||||
<Tip>
|
||||
|
||||
Learn more details about using ORT with ЁЯдЧ Optimum in the [Optimum Inference with ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/models) guide. This section only provides a brief and simple example.
|
||||
|
||||
</Tip>
|
||||
|
||||
ONNX Runtime (ORT) is a model accelerator that runs inference on CPUs by default. ORT is supported by ЁЯдЧ Optimum which can be used in ЁЯдЧ Transformers, without making too many changes to your code. You only need to replace the ЁЯдЧ Transformers `AutoClass` with its equivalent [`~optimum.onnxruntime.ORTModel`] for the task you're solving, and load a checkpoint in the ONNX format.
|
||||
|
||||
For example, if you're running inference on a question answering task, load the [optimum/roberta-base-squad2](https://huggingface.co/optimum/roberta-base-squad2) checkpoint which contains a `model.onnx` file:
|
||||
|
||||
```py
|
||||
from transformers import AutoTokenizer, pipeline
|
||||
from optimum.onnxruntime import ORTModelForQuestionAnswering
|
||||
|
||||
model = ORTModelForQuestionAnswering.from_pretrained("optimum/roberta-base-squad2")
|
||||
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
|
||||
|
||||
onnx_qa = pipeline("question-answering", model=model, tokenizer=tokenizer)
|
||||
|
||||
question = "What's my name?"
|
||||
context = "My name is Philipp and I live in Nuremberg."
|
||||
pred = onnx_qa(question, context)
|
||||
```
|
||||
|
||||
If you have an Intel CPU, take a look at ЁЯдЧ [Optimum Intel](https://huggingface.co/docs/optimum/intel/index) which supports a variety of compression techniques (quantization, pruning, knowledge distillation) and tools for converting models to the [OpenVINO](https://huggingface.co/docs/optimum/intel/inference) format for higher performance inference.
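For example, a minimal OpenVINO sketch (assumes `optimum-intel` with the OpenVINO extra is installed; the checkpoint is only an example):

```py
from optimum.intel import OVModelForSequenceClassification
from transformers import AutoTokenizer, pipeline

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# export=True converts the PyTorch checkpoint to the OpenVINO format on the fly
ov_model = OVModelForSequenceClassification.from_pretrained(checkpoint, export=True)

classifier = pipeline("text-classification", model=ov_model, tokenizer=tokenizer)
print(classifier("OpenVINO inference on an Intel CPU."))
```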
|
||||
|
||||
@ -1,120 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
|
||||
тЪая╕П Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Efficient Inference on Multiple GPUs
|
||||
|
||||
This document contains information on how to run inference efficiently on multiple GPUs.
|
||||
<Tip>
|
||||
|
||||
Note: A multi-GPU setup can use the majority of the strategies described in the [single GPU section](./perf_infer_gpu_one). There are, however, a few simple techniques specific to multi-GPU usage that you should be aware of.
|
||||
|
||||
</Tip>
|
||||
|
||||
## BetterTransformer
|
||||
|
||||
[BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) converts ЁЯдЧ Transformers models to use the PyTorch-native fastpath execution, which calls optimized kernels like Flash Attention under the hood.
|
||||
|
||||
BetterTransformer is also supported for faster inference on single and multi-GPU for text, image, and audio models.
|
||||
|
||||
<Tip>
|
||||
|
||||
Flash Attention can only be used for models using fp16 or bf16 dtype. Make sure to cast your model to the appropriate dtype before using BetterTransformer.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Decoder models
|
||||
|
||||
For text models, especially decoder-based models (GPT, T5, Llama, etc.), the BetterTransformer API converts all attention operations to use the [`torch.nn.functional.scaled_dot_product_attention` operator](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) (SDPA) that is only available in PyTorch 2.0 and onwards.
|
||||
|
||||
To convert a model to BetterTransformer:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
|
||||
# convert the model to BetterTransformer
|
||||
model.to_bettertransformer()
|
||||
|
||||
# Use it for training or inference
|
||||
```
|
||||
|
||||
SDPA can also call [Flash Attention](https://arxiv.org/abs/2205.14135) kernels under the hood. To enable Flash Attention or to check that it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
|
||||
|
||||
|
||||
```diff
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m").to("cuda")
|
||||
# convert the model to BetterTransformer
|
||||
model.to_bettertransformer()
|
||||
|
||||
input_text = "Hello my dog is cute and"
|
||||
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
|
||||
+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
|
||||
outputs = model.generate(**inputs)
|
||||
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
If you see a bug with a traceback saying
|
||||
|
||||
```bash
|
||||
RuntimeError: No available kernel. Aborting execution.
|
||||
```
|
||||
|
||||
try using the PyTorch nightly version, which may have a broader coverage for Flash Attention:
|
||||
|
||||
```bash
|
||||
pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
|
||||
```
|
||||
|
||||
Have a look at this [blog post](https://pytorch.org/blog/out-of-the-box-acceleration/) to learn more about what is possible with the BetterTransformer + SDPA API.
|
||||
|
||||
### Encoder models
|
||||
|
||||
For encoder models during inference, BetterTransformer dispatches the forward call of encoder layers to an equivalent of [`torch.nn.TransformerEncoderLayer`](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html) that will execute the fastpath implementation of the encoder layers.
|
||||
|
||||
Because `torch.nn.TransformerEncoderLayer` fastpath does not support training, it is dispatched to `torch.nn.functional.scaled_dot_product_attention` instead, which does not leverage nested tensors but can use Flash Attention or Memory-Efficient Attention fused kernels.
|
||||
|
||||
More details about BetterTransformer performance can be found in this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2), and you can learn more about BetterTransformer for encoder models in this [blog](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/).
|
||||
|
||||
|
||||
## Advanced usage: mixing FP4 (or Int8) and BetterTransformer
|
||||
|
||||
You can combine the different methods described above to get the best performance for your model. For example, you can use BetterTransformer with FP4 mixed-precision inference + flash attention:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
|
||||
quantization_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=torch.float16
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config)
|
||||
|
||||
input_text = "Hello my dog is cute and"
|
||||
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
|
||||
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
|
||||
outputs = model.generate(**inputs)
|
||||
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
```
|
||||
@ -13,61 +13,154 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Efficient Inference on a Single GPU
|
||||
# GPU inference
|
||||
|
||||
In addition to this guide, relevant information can be found as well in [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu).
|
||||
|
||||
## BetterTransformer
|
||||
|
||||
[BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) converts ЁЯдЧ Transformers models to use the PyTorch-native fastpath execution, which calls optimized kernels like Flash Attention under the hood.
|
||||
|
||||
BetterTransformer is also supported for faster inference on single and multi-GPU for text, image, and audio models.
|
||||
GPUs are the standard choice of hardware for machine learning, unlike CPUs, because they are optimized for memory bandwidth and parallelism. To keep up with the larger sizes of modern models or to run these large models on existing and older hardware, there are several optimizations you can use to speed up GPU inference. In this guide, you'll learn how to use FlashAttention-2 (a more memory-efficient attention mechanism), BetterTransformer (a PyTorch native fastpath execution), and bitsandbytes to quantize your model to a lower precision. Finally, learn how to use ЁЯдЧ Optimum to accelerate inference with ONNX Runtime on Nvidia GPUs.
|
||||
|
||||
<Tip>
|
||||
|
||||
Flash Attention can only be used for models using fp16 or bf16 dtype. Make sure to cast your model to the appropriate dtype before using BetterTransformer.
|
||||
The majority of the optimizations described here also apply to multi-GPU setups!
|
||||
|
||||
</Tip>
|
||||
|
||||
### Encoder models
|
||||
## FlashAttention-2
|
||||
|
||||
PyTorch-native [`nn.MultiHeadAttention`](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) attention fastpath, called BetterTransformer, can be used with Transformers through the integration in the [ЁЯдЧ Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview).
|
||||
<Tip>
|
||||
|
||||
PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of [nested tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2).
|
||||
FlashAttention-2 is experimental and may change considerably in future versions.
|
||||
|
||||
After installing the [`optimum`](https://github.com/huggingface/optimum) package, to use Better Transformer during inference, the relevant internal modules are replaced by calling [`~PreTrainedModel.to_bettertransformer`]:
|
||||
</Tip>
|
||||
|
||||
[FlashAttention-2](https://huggingface.co/papers/2205.14135) is a faster and more efficient implementation of the standard attention mechanism that can significantly speed up inference by:
|
||||
|
||||
1. additionally parallelizing the attention computation over sequence length
|
||||
2. partitioning the work between GPU threads to reduce communication and shared memory reads/writes between them
|
||||
|
||||
FlashAttention-2 supports inference with Llama, Mistral, and Falcon models. You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request.
|
||||
|
||||
Before you begin, make sure you have FlashAttention-2 installed (see the [installation](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features) guide for more details about prerequisites):
|
||||
|
||||
```bash
|
||||
pip install flash-attn --no-build-isolation
|
||||
```
|
||||
|
||||
To enable FlashAttention-2, add the `use_flash_attention_2` parameter to [`~AutoModelForCausalLM.from_pretrained`]:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
|
||||
|
||||
model_id = "tiiuae/falcon-7b"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id,
|
||||
torch_dtype=torch.bfloat16,
|
||||
use_flash_attention_2=True,
|
||||
)
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
FlashAttention-2 can only be used when the model's dtype is `fp16` or `bf16`, and it only runs on Nvidia GPUs. Make sure to cast your model to the appropriate dtype and load them on a supported device before using FlashAttention-2.
|
||||
|
||||
</Tip>
|
||||
|
||||
FlashAttention-2 can be combined with other optimization techniques like quantization to further speedup inference. For example, you can combine FlashAttention-2 with 8-bit or 4-bit quantization:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
|
||||
|
||||
model_id = "tiiuae/falcon-7b"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
# load in 8bit
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id,
|
||||
load_in_8bit=True,
|
||||
use_flash_attention_2=True,
|
||||
)
|
||||
|
||||
# load in 4bit
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id,
|
||||
load_in_4bit=True,
|
||||
use_flash_attention_2=True,
|
||||
)
|
||||
```
|
||||
|
||||
### Expected speedups
|
||||
|
||||
You can benefit from considerable speedups for inference, especially for inputs with long sequences. However, since FlashAttention-2 does not support computing attention scores with padding tokens, you must manually pad/unpad the attention scores for batched inference when the sequence contains padding tokens. This leads to a significant slowdown for batched generations with padding tokens.
|
||||
|
||||
To overcome this, you should use FlashAttention-2 without padding tokens in the sequence during training (by packing a dataset or [concatenating sequences](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py#L516) until reaching the maximum sequence length).
|
||||
|
||||
For a single forward pass on [tiiuae/falcon-7b](https://hf.co/tiiuae/falcon-7b) with a sequence length of 4096 and various batch sizes without padding tokens, the expected speedup is:
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/falcon-7b-inference-large-seqlen.png">
|
||||
</div>
|
||||
|
||||
For a single forward pass on [meta-llama/Llama-7b-hf](https://hf.co/meta-llama/Llama-7b-hf) with a sequence length of 4096 and various batch sizes without padding tokens, the expected speedup is:
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-7b-inference-large-seqlen.png">
|
||||
</div>
|
||||
|
||||
For sequences with padding tokens (generating with padding tokens), you need to unpad/pad the input sequences to correctly compute the attention scores. With a relatively small sequence length, a single forward pass creates overhead leading to a small speedup (in the example below, 30% of the input is filled with padding tokens):
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-2-small-seqlen-padding.png">
|
||||
</div>
|
||||
|
||||
But for larger sequence lengths, you can expect even more speedup benefits:
|
||||
|
||||
<Tip>
|
||||
|
||||
FlashAttention is more memory efficient, meaning you can train on much larger sequence lengths without running into out-of-memory issues. You can potentially reduce memory usage up to 20x for larger sequence lengths. Take a look at the [flash-attention](https://github.com/Dao-AILab/flash-attention) repository for more details.
|
||||
|
||||
</Tip>
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-2-large-seqlen-padding.png">
|
||||
</div>
|
||||
|
||||
## BetterTransformer
|
||||
|
||||
<Tip>
|
||||
|
||||
Check out our benchmarks with BetterTransformer and scaled dot product attention in the [Out of the box acceleration and memory savings of ЁЯдЧ decoder models with PyTorch 2.0](https://pytorch.org/blog/out-of-the-box-acceleration/) and learn more about the fastpath execution in the [BetterTransformer](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2) blog post.
|
||||
|
||||
</Tip>
|
||||
|
||||
BetterTransformer accelerates inference with its fastpath (native PyTorch specialized implementation of Transformer functions) execution. The two optimizations in the fastpath execution are:
|
||||
|
||||
1. fusion, which combines multiple sequential operations into a single "kernel" to reduce the number of computation steps
|
||||
2. skipping the inherent sparsity of padding tokens to avoid unnecessary computation with nested tensors
|
||||
|
||||
BetterTransformer also converts all attention operations to use the more memory-efficient [scaled dot product attention (SDPA)](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention), and it calls optimized kernels like [FlashAttention](https://huggingface.co/papers/2205.14135) under the hood.
|
||||
|
||||
Before you start, make sure you have ЁЯдЧ Optimum [installed](https://huggingface.co/docs/optimum/installation).
|
||||
|
||||
Then you can enable BetterTransformer with the [`PreTrainedModel.to_bettertransformer`] method:
|
||||
|
||||
```python
|
||||
model = model.to_bettertransformer()
|
||||
```
|
||||
|
||||
The method [`~PreTrainedModel.reverse_bettertransformer`] allows to go back to the original modeling, which should be used before saving the model in order to use the canonical transformers modeling:
|
||||
You can return the original Transformers model with the [`~PreTrainedModel.reverse_bettertransformer`] method. You should use this before saving your model to use the canonical Transformers modeling:
|
||||
|
||||
```python
|
||||
```py
|
||||
model = model.reverse_bettertransformer()
|
||||
model.save_pretrained("saved_model")
|
||||
```
|
||||
|
||||
Have a look at this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2) to learn more about what is possible to do with `BetterTransformer` API for encoder models.
|
||||
### FlashAttention
|
||||
|
||||
### Decoder models
|
||||
|
||||
For text models, especially decoder-based models (GPT, T5, Llama, etc.), the BetterTransformer API converts all attention operations to use the [`torch.nn.functional.scaled_dot_product_attention` operator](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) (SDPA) that is only available in PyTorch 2.0 and onwards.
|
||||
|
||||
To convert a model to BetterTransformer:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
|
||||
# convert the model to BetterTransformer
|
||||
model.to_bettertransformer()
|
||||
|
||||
# Use it for training or inference
|
||||
```
|
||||
|
||||
SDPA can also call [Flash Attention](https://arxiv.org/abs/2205.14135) kernels under the hood. To enable Flash Attention or to check that it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
|
||||
SDPA can also call FlashAttention kernels under the hood. FlashAttention can only be used for models using the `fp16` or `bf16` dtype, so make sure to cast your model to the appropriate dtype before using it.
|
||||
|
||||
To enable FlashAttention or to check whether it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
|
||||
|
||||
```diff
|
||||
import torch
|
||||
@ -87,47 +180,32 @@ inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
If you see a bug with a traceback saying
|
||||
If you see a bug with the traceback below, try using the nightly version of PyTorch, which may have broader coverage for FlashAttention:
|
||||
|
||||
```bash
|
||||
RuntimeError: No available kernel. Aborting execution.
|
||||
```
|
||||
|
||||
try using the PyTorch nightly version, which may have a broader coverage for Flash Attention:
|
||||
|
||||
```bash
|
||||
# install PyTorch nightly
|
||||
pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
|
||||
```
|
||||
|
||||
Or make sure your model is correctly cast to float16 or bfloat16.
|
||||
## bitsandbytes
|
||||
|
||||
bitsandbytes is a quantization library that includes support for 4-bit and 8-bit quantization. Quantization reduces your model size compared to its native full precision version, making it easier to fit large models onto GPUs with limited memory.
|
||||
|
||||
Have a look at [this detailed blogpost](https://pytorch.org/blog/out-of-the-box-acceleration/) to read more about what is possible to do with `BetterTransformer` + SDPA API.
|
||||
Make sure you have bitsandbytes and 🤗 Accelerate installed:
|
||||
|
||||
## `bitsandbytes` integration for FP4 mixed-precision inference
|
||||
```bash
|
||||
# these versions support 8-bit and 4-bit
|
||||
pip install bitsandbytes>=0.39.0 accelerate>=0.20.0
|
||||
|
||||
You can install `bitsandbytes` and benefit from easy model compression on GPUs. Using FP4 quantization you can expect to reduce the model size by up to 8x compared to its native full-precision version. Check out below how to get started.
|
||||
# install Transformers
|
||||
pip install transformers
|
||||
```
|
||||
|
||||
<Tip>
|
||||
### 4-bit
|
||||
|
||||
Note that this feature can also be used in a multi GPU setup.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Requirements [[requirements-for-fp4-mixedprecision-inference]]
|
||||
|
||||
- Latest `bitsandbytes` library
|
||||
`pip install bitsandbytes>=0.39.0`
|
||||
|
||||
- Install latest `accelerate` from source
|
||||
`pip install git+https://github.com/huggingface/accelerate.git`
|
||||
|
||||
- Install latest `transformers` from source
|
||||
`pip install git+https://github.com/huggingface/transformers.git`
|
||||
|
||||
### Running FP4 models - single GPU setup - Quickstart
|
||||
|
||||
You can quickly run a FP4 model on a single GPU by running the following code:
|
||||
To load a model in 4-bit for inference, use the `load_in_4bit` parameter. The `device_map` parameter is optional, but we recommend setting it to `"auto"` to allow ЁЯдЧ Accelerate to automatically and efficiently allocate the model given the available resources in the environment.
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM
|
||||
@ -135,16 +213,8 @@ from transformers import AutoModelForCausalLM
|
||||
model_name = "bigscience/bloom-2b5"
|
||||
model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)
|
||||
```
|
||||
Note that `device_map` is optional, but setting `device_map = 'auto'` is preferred for inference as it will efficiently dispatch the model on the available resources.
|
||||
|
||||
### Running FP4 models - multi GPU setup
|
||||
|
||||
The way to load your mixed 4-bit model in multiple GPUs is as follows (same command as single GPU setup):
|
||||
```py
|
||||
model_name = "bigscience/bloom-2b5"
|
||||
model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)
|
||||
```
|
||||
But you can control the GPU RAM you want to allocate on each GPU using `accelerate`. Use the `max_memory` argument as follows:
|
||||
To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU:
|
||||
|
||||
```py
|
||||
max_memory_mapping = {0: "600MB", 1: "1GB"}
|
||||
@ -153,44 +223,16 @@ model_4bit = AutoModelForCausalLM.from_pretrained(
|
||||
model_name, device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping
|
||||
)
|
||||
```
|
||||
In this example, the first GPU will use 600MB of memory and the second 1GB.
|
||||
|
||||
### Advanced usage
|
||||
|
||||
For more advanced usage of this method, please have a look at the [quantization](main_classes/quantization) documentation page.
|
||||
|
||||
## `bitsandbytes` integration for Int8 mixed-precision matrix decomposition
|
||||
### 8-bit
|
||||
|
||||
<Tip>
|
||||
|
||||
Note that this feature can also be used in a multi GPU setup.
|
||||
If you're curious and interested in learning more about the concepts underlying 8-bit quantization, read the [Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration) blog post.
|
||||
|
||||
</Tip>
|
||||
|
||||
From the paper [`LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale`](https://arxiv.org/abs/2208.07339), we support Hugging Face integration for all models in the Hub with a few lines of code.
|
||||
The method reduces `nn.Linear` size by 2 for `float16` and `bfloat16` weights and by 4 for `float32` weights, with close to no impact to the quality by operating on the outliers in half-precision.
|
||||
|
||||

|
||||
|
||||
Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) a systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models.
|
||||
For more details regarding the method, check out the [paper](https://arxiv.org/abs/2208.07339) or our [blogpost about the integration](https://huggingface.co/blog/hf-bitsandbytes-integration).
|
||||
|
||||

|
||||
|
||||
Note that you need a GPU to run mixed-8bit models, as the kernels have been compiled for GPUs only. Make sure that you have enough GPU memory to store a quarter (or half, if your model weights are in half precision) of the model before using this feature.
|
||||
Below are some notes to help you use this module, or follow the demos on [Google colab](#colab-demos).
|
||||
|
||||
### Requirements [[requirements-for-int8-mixedprecision-matrix-decomposition]]
|
||||
|
||||
- If you have `bitsandbytes<0.37.0`, make sure you run on NVIDIA GPUs that support 8-bit tensor cores (Turing, Ampere or newer architectures - e.g. T4, RTX20s RTX30s, A40-A100). For `bitsandbytes>=0.37.0`, all GPUs should be supported.
|
||||
- Install the correct version of `bitsandbytes` by running:
|
||||
`pip install bitsandbytes>=0.31.5`
|
||||
- Install `accelerate`
|
||||
`pip install accelerate>=0.12.0`
|
||||
|
||||
### Running mixed-Int8 models - single GPU setup
|
||||
|
||||
After installing the required libraries, the way to load your mixed 8-bit model is as follows:
|
||||
To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `device_map` parameter is optional, but we recommend setting it to `"auto"` to allow ЁЯдЧ Accelerate to automatically and efficiently allocate the model given the available resources in the environment:
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM
|
||||
@ -199,12 +241,7 @@ model_name = "bigscience/bloom-2b5"
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
|
||||
```
|
||||
|
||||
For text generation, we recommend:
|
||||
|
||||
* using the model's `generate()` method instead of the `pipeline()` function. Although inference is possible with the `pipeline()` function, it is not optimized for mixed-8bit models, and will be slower than using the `generate()` method. Moreover, some sampling strategies, like nucleus sampling, are not supported by the `pipeline()` function for mixed-8bit models.
|
||||
* placing all inputs on the same device as the model.
|
||||
|
||||
Here is a simple example:
|
||||
If you're loading a model in 8-bit for text generation, you should use the [`~transformers.GenerationMixin.generate`] method instead of the [`Pipeline`] function which is not optimized for 8-bit models and will be slower. Some sampling strategies, like nucleus sampling, are also not supported by the [`Pipeline`] for 8-bit models. You should also place all inputs on the same device as the model:
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
@ -219,15 +256,7 @@ generated_ids = model.generate(**inputs)
|
||||
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
|
||||
```
|
||||
|
||||
|
||||
### Running mixed-int8 models - multi GPU setup
|
||||
|
||||
The way to load your mixed 8-bit model in multiple GPUs is as follows (same command as single GPU setup):
|
||||
```py
|
||||
model_name = "bigscience/bloom-2b5"
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
|
||||
```
|
||||
But you can control the GPU RAM you want to allocate on each GPU using `accelerate`. Use the `max_memory` argument as follows:
|
||||
To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU:
|
||||
|
||||
```py
|
||||
max_memory_mapping = {0: "1GB", 1: "2GB"}
|
||||
@ -236,27 +265,56 @@ model_8bit = AutoModelForCausalLM.from_pretrained(
|
||||
model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
|
||||
)
|
||||
```
|
||||
In this example, the first GPU will use 1GB of memory and the second 2GB.
|
||||
|
||||
### Colab demos
|
||||
<Tip>
|
||||
|
||||
With this method you can run inference on models that previously could not be run on Google Colab.
Check out the demo for running T5-11b (42GB in fp32) using 8-bit quantization on Google Colab:
|
||||
Feel free to try running an 11 billion parameter [T5 model](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing) or the 3 billion parameter [BLOOM model](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing) for inference on Google Colab's free tier GPUs!
|
||||
|
||||
[](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing)
|
||||
</Tip>
|
||||
|
||||
Or this demo for BLOOM-3B:
|
||||
## ЁЯдЧ Optimum
|
||||
|
||||
[](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing)
|
||||
<Tip>
|
||||
|
||||
## Advanced usage: mixing FP4 (or Int8) and BetterTransformer
|
||||
Learn more details about using ORT with ЁЯдЧ Optimum in the [Accelerated inference on NVIDIA GPUs](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#accelerated-inference-on-nvidia-gpus) guide. This section only provides a brief and simple example.
|
||||
|
||||
You can combine the different methods described above to get the best performance for your model. For example, you can use BetterTransformer with FP4 mixed-precision inference + flash attention:
|
||||
</Tip>
|
||||
|
||||
ONNX Runtime (ORT) is a model accelerator that supports accelerated inference on Nvidia GPUs. ORT uses optimization techniques like fusing common operations into a single node and constant folding to reduce the number of computations performed and speedup inference. ORT also places the most computationally intensive operations on the GPU and the rest on the CPU to intelligently distribute the workload between the two devices.
|
||||
|
||||
ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers. You'll need to use an [`~optimum.onnxruntime.ORTModel`] for the task you're solving, and specify the `provider` parameter which can be set to either [`CUDAExecutionProvider`](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#cudaexecutionprovider) or [`TensorrtExecutionProvider`](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#tensorrtexecutionprovider). If you want to load a model that was not yet exported to ONNX, you can set `export=True` to convert your model on-the-fly to the ONNX format:
|
||||
|
||||
```py
|
||||
from optimum.onnxruntime import ORTModelForSequenceClassification
|
||||
|
||||
ort_model = ORTModelForSequenceClassification.from_pretrained(
|
||||
"distilbert-base-uncased-finetuned-sst-2-english",
|
||||
export=True,
|
||||
provider="CUDAExecutionProvider",
|
||||
)
|
||||
```
|
||||
|
||||
Now you're free to use the model for inference:
|
||||
|
||||
```py
|
||||
from optimum.pipelines import pipeline
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
|
||||
|
||||
pipeline = pipeline(task="text-classification", model=ort_model, tokenizer=tokenizer, device="cuda:0")
|
||||
result = pipeline("Both the music and visual were astounding, not to mention the actors performance.")
|
||||
```
|
||||
|
||||
## Combine optimizations
|
||||
|
||||
It is often possible to combine several of the optimization techniques described above to get the best inference performance possible for your model. For example, you can load a model in 4-bit, and then enable BetterTransformer with FlashAttention:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
|
||||
# load model in 4-bit
|
||||
quantization_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=torch.float16
|
||||
@ -265,9 +323,13 @@ quantization_config = BitsAndBytesConfig(
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config)
|
||||
|
||||
# enable BetterTransformer
|
||||
model = model.to_bettertransformer()
|
||||
|
||||
input_text = "Hello my dog is cute and"
|
||||
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
|
||||
# enable FlashAttention
|
||||
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
|
||||
outputs = model.generate(**inputs)
|
||||
|
||||
|
||||
@ -15,143 +15,154 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Efficient Training on Multiple GPUs
|
||||
|
||||
When training on a single GPU is too slow or the model weights don't fit in a single GPU's memory, we use a multi-GPU setup. Switching from a single GPU to multiple GPUs requires some form of parallelism, as the work needs to be distributed. There are several techniques to achieve parallelism, such as data, tensor, or pipeline parallelism. However, there is no one-size-fits-all solution, and which settings work best depends on the hardware you are running on. While the main concepts will most likely apply to any other framework, this article focuses on PyTorch-based implementations.
|
||||
If training a model on a single GPU is too slow or if the model's weights do not fit in a single GPU's memory, transitioning
|
||||
to a multi-GPU setup may be a viable option. Prior to making this transition, thoroughly explore all the strategies covered
|
||||
in the [Methods and tools for efficient training on a single GPU](perf_train_gpu_one) as they are universally applicable
|
||||
to model training on any number of GPUs. Once you have employed those strategies and found them insufficient for your
|
||||
case on a single GPU, consider moving to multiple GPUs.
|
||||
|
||||
Transitioning from a single GPU to multiple GPUs requires the introduction of some form of parallelism, as the workload
|
||||
must be distributed across the resources. Multiple techniques can be employed to achieve parallelism, such as data
|
||||
parallelism, tensor parallelism, and pipeline parallelism. It's important to note that there isn't a one-size-fits-all
|
||||
solution, and the optimal settings depend on the specific hardware configuration you are using.
|
||||
|
||||
This guide offers an in-depth overview of individual types of parallelism, as well as guidance on ways to combine
|
||||
techniques and choosing an appropriate approach. For step-by-step tutorials on distributed training, please refer to
|
||||
the [ЁЯдЧ Accelerate documentation](https://huggingface.co/docs/accelerate/index).
|
||||
|
||||
<Tip>
|
||||
|
||||
Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) are generic and apply to training models in general so make sure to have a look at it before diving into the following sections such as multi-GPU or CPU training.
|
||||
While the main concepts discussed in this guide are likely applicable across frameworks, here we focus on
|
||||
PyTorch-based implementations.
|
||||
|
||||
</Tip>
|
||||
|
||||
We will first discuss in depth various 1D parallelism techniques and their pros and cons, and then look at how they can be combined into 2D and 3D parallelism to enable even faster training and to support even bigger models. Various other powerful alternative approaches will be presented.
|
||||
Before diving deeper into the specifics of each technique, let's go over the rough decision process when training
|
||||
large models on a large infrastructure.
|
||||
|
||||
## Concepts
|
||||
## Scalability strategy
|
||||
|
||||
The following is the brief description of the main concepts that will be described later in depth in this document.
|
||||
Begin by estimating how much vRAM is required to train your model. For models hosted on the ЁЯдЧ Hub, use our
|
||||
[Model Memory Calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage), which gives you
|
||||
accurate calculations within a few percent margin.
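As a rough back-of-the-envelope check before reaching for the calculator (illustrative only):

```py
# weights-only memory for a 7B-parameter model stored in bf16/fp16 (2 bytes per parameter)
num_params = 7e9
weights_gb = num_params * 2 / 1024**3
print(f"~{weights_gb:.0f} GB just for the weights")  # ~13 GB; training with Adam needs several times more
```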
|
||||
|
||||
1. **DataParallel (DP)** - the same setup is replicated multiple times, and each replica is fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step.
|
||||
2. **TensorParallel (TP)** - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single GPU, each shard of the tensor resides on its designated GPU. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens at the horizontal level.
|
||||
3. **PipelineParallel (PP)** - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are places on a single gpu. Each gpu processes in parallel different stages of the pipeline and working on a small chunk of the batch.
|
||||
4. **Zero Redundancy Optimizer (ZeRO)** - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. It also supports various offloading techniques to compensate for limited GPU memory.
|
||||
5. **Sharded DDP** - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO.
|
||||
**Parallelization strategy for a single Node / multi-GPU setup**
|
||||
|
||||
Before diving deeper into the specifics of each concept we first have a look at the rough decision process when training large models on a large infrastructure.
|
||||
When training a model on a single node with multiple GPUs, your choice of parallelization strategy can significantly
|
||||
impact performance. Here's a breakdown of your options:
|
||||
|
||||
## Scalability Strategy
|
||||
**Case 1: Your model fits onto a single GPU**
|
||||
|
||||
**тЗи Single Node / Multi-GPU**
|
||||
* Model fits onto a single GPU:
|
||||
If your model can comfortably fit onto a single GPU, you have two primary options:
|
||||
|
||||
1. DDP - Distributed DP
|
||||
2. ZeRO - may or may not be faster depending on the situation and configuration used
|
||||
1. DDP - Distributed DataParallel
|
||||
2. ZeRO - depending on the situation and configuration used, this method may or may not be faster, however, it's worth experimenting with it.
|
||||
|
||||
* Model doesn't fit onto a single GPU:
|
||||
**Case 2: Your model doesn't fit onto a single GPU:**
|
||||
|
||||
1. PP
|
||||
If your model is too large for a single GPU, you have several alternatives to consider:
|
||||
|
||||
1. PipelineParallel (PP)
|
||||
2. ZeRO
|
||||
3. TP
|
||||
3. TensorParallel (TP)
|
||||
|
||||
With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup.
|
||||
With very fast inter-node connectivity (e.g., NVLINK or NVSwitch) all three strategies (PP, ZeRO, TP) should result in
|
||||
similar performance. However, without these, PP will be faster than TP or ZeRO. The degree of TP may also
|
||||
make a difference. It's best to experiment with your specific setup to determine the most suitable strategy.
|
||||
|
||||
TP is almost always used within a single node. That is TP size <= gpus per node.
|
||||
TP is almost always used within a single node. That is TP size <= GPUs per node.
|
||||
|
||||
* Largest Layer not fitting into a single GPU:
|
||||
**Case 3: Largest layer of your model does not fit onto a single GPU**
|
||||
|
||||
1. If not using ZeRO - must use TP, as PP alone won't be able to fit.
|
||||
2. With ZeRO see the same entry for "Single GPU" above
|
||||
1. If you are not using ZeRO, you have to use TensorParallel (TP), because PipelineParallel (PP) alone won't be sufficient to accommodate the large layer.
|
||||
2. If you are using ZeRO, additionally adopt techniques from the [Methods and tools for efficient training on a single GPU](perf_train_gpu_one).
|
||||
|
||||
**Parallelization strategy for a multi-Node / multi-GPU setup**
|
||||
|
||||
**тЗи Multi-Node / Multi-GPU**
|
||||
|
||||
* When you have fast inter-node connectivity:
|
||||
* When you have fast inter-node connectivity (e.g., NVLINK or NVSwitch) consider using one of these options:
|
||||
|
||||
1. ZeRO - as it requires close to no modifications to the model
|
||||
2. PP+TP+DP - less communications, but requires massive changes to the model
|
||||
2. A combination of PipelineParallel(PP) with TensorParallel(TP) and DataParallel(DP) - this approach will result in fewer communications, but requires significant changes to the model
|
||||
|
||||
* when you have slow inter-node connectivity and still low on GPU memory:
|
||||
|
||||
1. DP+PP+TP+ZeRO-1
|
||||
* When you have slow inter-node connectivity and still low on GPU memory:
|
||||
|
||||
1. Employ a combination of DataParallel(DP) with PipelineParallel(PP), TensorParallel(TP), and ZeRO.
|
||||
|
||||
In the following sections of this guide we dig deeper into how these different parallelism methods work.
|
||||
|
||||
## Data Parallelism

Even with only 2 GPUs, you can readily leverage the accelerated training capabilities offered by PyTorch's built-in features,
such as `DataParallel` (DP) and `DistributedDataParallel` (DDP). Note that the
[PyTorch documentation](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html) recommends preferring
`DistributedDataParallel` (DDP) over `DataParallel` (DP) for multi-GPU training, as DDP is better maintained and works
for all models, while DP may fail for some models. Let's take a look at how these two methods work and what makes them different.

### DataParallel vs DistributedDataParallel

`DistributedDataParallel` (DDP) is typically faster than `DataParallel` (DP), but it is not always the case:
* while DP is Python threads-based, DDP is multiprocess-based - and as such it has no Python threads limitations, such as the GIL
* on the other hand, slow inter-connectivity between the GPU cards could lead to an actual slower outcome with DDP

To understand the key differences in inter-GPU communication overhead between the two methods, let's review the processes per batch:

[DDP](https://pytorch.org/docs/master/notes/ddp.html):

- At the start time the main process replicates the model once from GPU 0 to the rest of GPUs
- Then for each batch:
   1. Each GPU directly consumes its mini-batch of data.
   2. During `backward`, once the local gradients are ready, they are averaged across all processes.

[DP](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html):

For each batch:
   1. GPU 0 reads the batch of data and then sends a mini-batch to each GPU.
   2. The up-to-date model is replicated from GPU 0 to each GPU.
   3. `forward` is executed, and output from each GPU is sent to GPU 0 to compute the loss.
   4. The loss is distributed from GPU 0 to all GPUs, and `backward` is run.
   5. Gradients from each GPU are sent to GPU 0 and averaged.

Key differences include:
1. DDP performs only a single communication per batch - sending gradients, while DP performs five different data exchanges per batch.
DDP copies data using [torch.distributed](https://pytorch.org/docs/master/distributed.html), while DP copies data within
the process via Python threads (which introduces limitations associated with the GIL). As a result, **`DistributedDataParallel` (DDP) is generally faster than `DataParallel` (DP)** unless you have slow GPU card inter-connectivity.
2. Under DP, GPU 0 performs significantly more work than the other GPUs, resulting in GPU under-utilization.
3. DDP supports distributed training across multiple machines, whereas DP does not.

This is not an exhaustive list of differences between DP and DDP, however, other nuances are out of scope of this guide.
You can get a deeper understanding of these methods by reading this [article](https://www.telesens.co/2019/04/04/distributed-data-parallel-training-using-pytorch-on-aws/).
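If you are writing your own training loop rather than using the [`Trainer`] (which sets all of this up for you), the per-process skeleton of DDP looks roughly like the following minimal sketch. The toy model, data and hyperparameters are placeholders, and the script is assumed to be launched with `torchrun --nproc_per_node 2`:

```py
# Minimal DDP sketch (placeholders, not the benchmark script below).
# Launch with: torchrun --nproc_per_node 2 this_script.py
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def main():
    dist.init_process_group("nccl")              # reads RANK/WORLD_SIZE/MASTER_* set by torchrun
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    model = torch.nn.Linear(10, 10).to(local_rank)    # stand-in for a real model
    ddp_model = DDP(model, device_ids=[local_rank])
    optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=1e-3)

    inputs = torch.randn(8, 10).to(local_rank)        # each rank consumes its own mini-batch
    labels = torch.randn(8, 10).to(local_rank)

    loss = torch.nn.functional.mse_loss(ddp_model(inputs), labels)
    loss.backward()                                   # gradients are averaged across ranks here
    optimizer.step()

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```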
Let's illustrate the differences between DP and DDP with an experiment. We'll benchmark the differences between DP and
DDP with an added context of NVLink presence:

* Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`).
* Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0`.

To disable the NVLink feature on one of the benchmarks, we use `NCCL_P2P_DISABLE=1`.

Here is the benchmarking code and outputs:
**DP**

```
rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
python examples/pytorch/language-modeling/run_clm.py \
--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200

{'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69}
```

**DDP w/ NVlink**

```
rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200

{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
```

**DDP w/o NVlink**

```
rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \
python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200

{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
```

Here are the same benchmarking results gathered in a table for convenience:

| Type   | NVlink | Time |
| :----- | -----  | ---: |
| 2:DP   | Y      | 110s |
| 2:DDP  | Y      | 101s |
| 2:DDP  | N      | 131s |

As you can see, in this case DP is ~10% slower than DDP with NVlink, but ~15% faster than DDP without NVlink.
The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync,
the more a slow link will impede the overall runtime.
## ZeRO Data Parallelism

ZeRO-powered data parallelism (ZeRO-DP) is illustrated in the following diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/).

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero.png" alt="DeepSpeed-Image-1"/>
</div>

While it may appear complex, it is a very similar concept to `DataParallel` (DP). The difference is that instead of
replicating the full model parameters, gradients and optimizer states, each GPU stores only a slice of it. Then, at
run-time when the full layer parameters are needed just for the given layer, all GPUs synchronize to give each other
parts that they miss.

To illustrate this idea, consider a simple model with 3 layers (La, Lb, and Lc), where each layer has 3 parameters.
Layer La, for example, has weights a0, a1 and a2:

```
La | Lb | Lc
---|----|---
a0 | b0 | c0
a1 | b1 | c1
a2 | b2 | c2
```

If we have 3 GPUs, ZeRO-DP (also known as Sharded DDP) splits the model onto 3 GPUs like so:

```
GPU0:
La | Lb | Lc
---|----|---
a0 | b0 | c0

GPU1:
La | Lb | Lc
---|----|---
a1 | b1 | c1

GPU2:
La | Lb | Lc
---|----|---
a2 | b2 | c2
```

In a way, this is the same horizontal slicing as tensor parallelism, as opposed to vertical
slicing, where one puts whole layer-groups on different GPUs. Now let's see how this works:

Each of these GPUs will get the usual mini-batch as it works in DP:

```
x0 => GPU0
x1 => GPU1
x2 => GPU2
```

The inputs are passed without modifications as if they would be processed by the original model.

First, the inputs get to the layer `La`. What happens at this point?

On GPU0: the x0 mini-batch requires the a0, a1, a2 parameters to do its forward path through the layer, but GPU0 has only a0.
It will get a1 from GPU1 and a2 from GPU2, bringing all the pieces of the model together.

In parallel, GPU1 gets another mini-batch - x1. GPU1 has the a1 parameter, but needs a0 and a2, so it gets those from GPU0 and GPU2.
Same happens to GPU2 that gets the mini-batch x2. It gets a0 and a1 from GPU0 and GPU1.

This way each of the 3 GPUs gets the full tensors reconstructed and makes a forward pass with its own mini-batch.
As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation.
The reconstruction is done efficiently via a pre-fetch.

Then the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La.

<Tip>

This mechanism is similar to an efficient group backpacking strategy: person A carries the tent, person B carries the stove,
and person C carries the axe. Each night they all share what they have with others and get from others what they don't have,
and in the morning they pack up their allocated type of gear and continue on their way. This is what ZeRO DP/Sharded DDP is.
Compare this strategy to the simple one where each person has to carry their own tent, stove and axe (similar to
DataParallel (DP and DDP) in PyTorch), which would be far more inefficient.

</Tip>

While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned.

If you pay close attention to the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism
which will be discussed later. This is because it partitions/shards each layer's weights, unlike vertical model parallelism
which is discussed next.

Implementations:

- [DeepSpeed](https://www.deepspeed.ai/features/#the-zero-redundancy-optimizer) ZeRO-DP stages 1+2+3
- [`Accelerate` integration](https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed)
- [`transformers` integration](main_classes/trainer#trainer-integrations)
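As a concrete, deliberately minimal sketch of what enabling ZeRO looks like with the [`Trainer`]'s DeepSpeed integration: the config below is an illustration rather than a tuned recipe, and it assumes DeepSpeed is installed and the script is started with the `deepspeed` (or `accelerate`) launcher.

```py
# Minimal sketch: handing a ZeRO stage 2 config to the Trainer via TrainingArguments.
from transformers import TrainingArguments

ds_config = {
    "zero_optimization": {
        "stage": 2,                              # shard optimizer states and gradients
        "offload_optimizer": {"device": "cpu"},  # optional: offload optimizer states to CPU
    },
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

training_args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=4,
    deepspeed=ds_config,  # the Trainer builds the DeepSpeed engine from this config
)
```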
## From Naive Model Parallelism to Pipeline Parallelism

To explain Pipeline parallelism, we'll first look into Naive Model Parallelism (MP), also known as Vertical MP. This approach
involves distributing groups of model layers across multiple GPUs by assigning specific layers to specific GPUs with `.to()`.
As data flows through these layers, it is moved to the same GPU as the layer, while the other layers remain untouched.

We refer to this Model parallelism as "Vertical" because of how models are typically visualized. For example, the
following diagram shows an 8-layer model split vertically into two slices, placing layers 0-3 onto
GPU0 and 4-7 to GPU1:

```
===================  ===================
|  0 | 1 | 2 | 3  |  |  4 | 5 | 6 | 7  |
===================  ===================
        GPU0                 GPU1
```

In this example, when data moves from layer 0 to 3, it's no different from a regular forward pass. However, passing data
from layer 3 to 4 requires moving it from GPU0 to GPU1, introducing a communication overhead. If the participating
GPUs are on the same compute node (e.g. same physical machine) this copying is fast, but if the GPUs are distributed
across different compute nodes (e.g. multiple machines), the communication overhead could be substantially greater.

Following that, layers 4 to 7 work as they would in the original model. Upon completion of the 7th layer, there is often
a need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). Now the loss can be
computed and the optimizer can do its work.
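Here is a minimal sketch of this `.to()`-based placement in plain PyTorch. The toy stack of linear layers stands in for a real model, and two available GPUs are assumed:

```py
# Naive (vertical) model parallelism: two blocks of layers pinned to two different GPUs.
import torch
import torch.nn as nn


class TwoGPUModel(nn.Module):
    def __init__(self):
        super().__init__()
        # "layers 0-3" on GPU0, "layers 4-7" on GPU1
        self.first_half = nn.Sequential(*[nn.Linear(1024, 1024) for _ in range(4)]).to("cuda:0")
        self.second_half = nn.Sequential(*[nn.Linear(1024, 1024) for _ in range(4)]).to("cuda:1")

    def forward(self, x):
        x = self.first_half(x.to("cuda:0"))
        x = self.second_half(x.to("cuda:1"))  # activations hop from GPU0 to GPU1 here
        return x


model = TwoGPUModel()
outputs = model(torch.randn(8, 1024))  # the output lives on cuda:1
```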
Naive Model Parallelism comes with several shortcomings:
- **All but one GPU are idle at any given moment**: if 4 GPUs are used, it's nearly identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware.
- **Overhead in data transfer between devices**: E.g. 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, but a single 24GB card will complete the training faster, because it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model, you can with 4x 40GB cards (but barely, because of the gradient and optimizer states).
- **Copying shared embeddings**: Shared embeddings may need to get copied back and forth between GPUs.
Now that you are familiar with how the naive approach to model parallelism works and its shortcomings, let's look at Pipeline Parallelism (PP).
PP is almost identical to a naive MP, but it solves the GPU idling problem by chunking the incoming batch into micro-batches
and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process.

The following illustration from the [GPipe paper](https://ai.googleblog.com/2019/03/introducing-gpipe-open-source-library.html)
shows the naive MP on the top, and PP on the bottom:

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-gpipe-bubble.png" alt="MP vs PP"/>
</div>

At the bottom of the diagram, you can observe that the Pipeline Parallelism (PP) approach minimizes the number of idle
GPU zones, referred to as "bubbles". Both parts of the diagram show a parallelism level of degree 4, meaning that 4 GPUs
are involved in the pipeline. You can see that there's a forward path of 4 pipe stages (F0, F1, F2 and F3) followed by
a backward path in reverse order (B3, B2, B1, and B0).

PP introduces a new hyperparameter to tune - `chunks`, which determines how many data chunks are sent in a sequence
through the same pipe stage. For example, in the bottom diagram you can see `chunks=4`. GPU0 performs the same
forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for the other GPUs to complete their work.
Only when the other GPUs begin to complete their work does GPU0 start to work again, doing the backward path for chunks
3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0).

Note that this is the same concept as gradient accumulation steps (GAS). PyTorch uses `chunks`, while DeepSpeed refers
to the same hyperparameter as GAS.

Because of the chunks, PP introduces the notion of micro-batches (MBS). DP splits the global data batch size into
mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of
256 each (1024/4). And if the number of `chunks` (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). Each
Pipeline stage works with a single micro-batch at a time. To calculate the global batch size of the DP + PP setup,
use the formula: `mbs * chunks * dp_degree` (`8 * 32 * 4 = 1024`).
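To make the batch-size bookkeeping concrete, here is the same arithmetic spelled out with the example numbers from above:

```py
# Batch-size bookkeeping for a DP + PP setup, using the example numbers from above.
global_batch_size = 1024
dp_degree = 4                 # number of data-parallel replicas
chunks = 32                   # PP chunks, i.e. gradient accumulation steps (GAS)

mini_batch_size = global_batch_size // dp_degree   # 256 per DP replica
micro_batch_size = mini_batch_size // chunks       # 8 per pipeline stage at a time

# the formula mbs * chunks * dp_degree recovers the global batch size
assert micro_batch_size * chunks * dp_degree == global_batch_size
```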
With `chunks=1` you end up with the naive MP, which is inefficient. With a very large `chunks` value you end up with
tiny micro-batch sizes which is also inefficient. For this reason, we encourage you to experiment with the `chunks` value to
find the one that leads to the most efficient GPU utilization.

You may notice a bubble of "dead" time on the diagram that can't be parallelized, because the last `forward` stage
has to wait for `backward` to complete the pipeline. The purpose of finding the best value for `chunks` is to enable a high
concurrent GPU utilization across all participating GPUs, which translates to minimizing the size of the bubble.

There are two groups of solutions - the traditional Pipeline API and more modern solutions that make things much easier for the end user.

Traditional Pipeline API solutions have been implemented in:
- PyTorch
- DeepSpeed
- Megatron-LM

These come with some shortcomings:
- They have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a `nn.Sequential` sequence of the same, which may require changes to the design of the model.
- Currently the Pipeline API is very restricted. If you had a bunch of Python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. Possible improvements are being discussed here: https://github.com/pytorch/pytorch/pull/50693
- Conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage.
- They have to arrange each layer so that the output of one layer becomes an input to the other layer.

More recent solutions include:
- Varuna
- Sagemaker

We have not experimented with Varuna and SageMaker but their papers report that they have overcome the list of problems
mentioned above and that they require smaller changes to the user's model.
Implementations:
- [PyTorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py)
- [DeepSpeed](https://www.deepspeed.ai/tutorials/pipeline/)
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation - no API.
- [Varuna](https://github.com/microsoft/varuna)
- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
- [OSLO](https://github.com/tunib-ai/oslo) - this is implemented based on the Hugging Face Transformers.
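To give a feel for the PyTorch Pipeline API listed above, here is a minimal sketch that closely follows the PyTorch `Pipe` documentation. It assumes two GPUs, uses toy layers, and note that this API has been experimental and has changed across PyTorch releases:

```py
# Minimal sketch of torch's experimental Pipe API: two toy stages, one per GPU.
import os

import torch
import torch.nn as nn
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe

# Pipe relies on the RPC framework, which must be initialized even for a single process.
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "29500"
rpc.init_rpc("worker", rank=0, world_size=1)

stage0 = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU()).to("cuda:0")
stage1 = nn.Sequential(nn.Linear(1024, 1024)).to("cuda:1")

model = Pipe(nn.Sequential(stage0, stage1), chunks=4)  # chunks = number of micro-batches

output_rref = model(torch.randn(8, 1024).to("cuda:0"))  # returns an RRef
output = output_rref.local_value()                      # the actual tensor, on cuda:1
```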
🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive MP support.
The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs be Tensors. This
is because currently the models include many features that make the conversion very complicated, and these will need to be removed to accomplish that.

DeepSpeed and Megatron-LM integrations are available in [🤗 Accelerate](https://huggingface.co/docs/accelerate/main/en/usage_guides/deepspeed)

Other approaches:

DeepSpeed, Varuna and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html)

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-sagemaker-interleaved-pipeline.png" alt="Interleaved pipeline execution"/>
</div>

Here the bubble (idle time) is further minimized by prioritizing backward passes. Varuna further attempts to improve the
schedule by using simulations to discover the most efficient scheduling.

OSLO has a pipeline parallelism implementation based on Transformers without `nn.Sequential` conversion.
## Tensor Parallelism

In Tensor Parallelism, each GPU processes a slice of a tensor and only aggregates the full tensor for operations requiring it.
To describe this method, this section of the guide relies on the concepts and diagrams from the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
paper: [Efficient Large-Scale Language Model Training on GPU Clusters](https://arxiv.org/abs/2104.04473).

The main building block of any transformer is a fully connected `nn.Linear` followed by a nonlinear activation `GeLU`.
The dot-product part of it, following the Megatron paper's notation, can be written as `Y = GeLU(XA)`, where `X` is
the input vector, `Y` is the output vector, and `A` is the weight matrix.

If we look at the computation in matrix form, you can see how the matrix multiplication can be split between multiple GPUs:

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_gemm.png" alt="Parallel GEMM"/>
</div>

If we split the weight matrix `A` column-wise across `N` GPUs and perform matrix multiplications `XA_1` through `XA_n` in parallel,
then we will end up with `N` output vectors `Y_1, Y_2, ..., Y_n` which can be fed into `GeLU` independently:

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-independent-gelu.png" alt="Independent GeLU"/>
</div>
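Here is a small self-contained sketch (on CPU, with random data) showing why the column-wise split works: because `GeLU` is applied element-wise, applying it to each shard and concatenating the results gives the same output as the unsharded computation:

```py
# Column-parallel linear layer, illustrated with plain matrix multiplications.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
X = torch.randn(4, 16)       # input
A = torch.randn(16, 32)      # weight matrix of the nn.Linear
A1, A2 = A.chunk(2, dim=1)   # column-wise split across 2 hypothetical GPUs

Y = F.gelu(X @ A)            # unsharded reference
Y1 = F.gelu(X @ A1)          # would run on GPU 1
Y2 = F.gelu(X @ A2)          # would run on GPU 2

# concatenating the shards reproduces the unsharded result
assert torch.allclose(Y, torch.cat([Y1, Y2], dim=1), atol=1e-6)
```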
Using this principle, we can update a multi-layer perceptron of arbitrary depth, without the need for any synchronization
between GPUs until the very end, where we need to reconstruct the output vector from shards. The Megatron-LM paper authors
provide a helpful illustration for that:

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_shard_processing.png" alt="Parallel shard processing"/>
</div>

Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having
multiple independent heads!

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_self_attention.png" alt="Parallel self-attention"/>
</div>

Special considerations: TP requires a very fast network, and therefore it's not advisable to do TP across more than one node.
Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. If you need a TP degree of 8, you need to use
nodes that have at least 8 GPUs.

This section is based on the original, much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530)
by [@anton-l](https://github.com/anton-l).

Alternative names:
- DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism)
Implementations:
- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
- [OSLO](https://github.com/tunib-ai/oslo) has the tensor parallelism implementation based on the Transformers.

SageMaker combines TP with DP for a more efficient processing.

🤗 Transformers status:
- core: not yet implemented in the core
- but if you want inference, [parallelformers](https://github.com/tunib-ai/parallelformers) provides this support for most of our models. So until this is implemented in the core you can use theirs. And hopefully training mode will be supported too.
- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode, see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/)

🤗 Accelerate integrates with [TP from Megatron-LM](https://huggingface.co/docs/accelerate/v0.23.0/en/usage_guides/megatron_lm).
## Data Parallelism + Pipeline Parallelism

The following diagram from the DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/) demonstrates
how one can combine DP with PP.

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero-dp-pp.png" alt="DP + PP-2d"/>
</div>

Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. To DP there are just GPUs 0
and 1 where it feeds data as if there were just 2 GPUs. GPU0 "secretly" offloads some of its load to GPU2 using PP.
And GPU1 does the same by enlisting GPU3 to its aid.

Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs.

🤗 Transformers status: not yet implemented
## Data Parallelism + Pipeline Parallelism + Tensor Parallelism

To get an even more efficient training, 3D parallelism is used where PP is combined with TP and DP. This can be seen in the following diagram.

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-deepspeed-3d.png" alt="dp-pp-tp-3d"/>
</div>

This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter models](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/), which is a good read as well.

🤗 Transformers status: not yet implemented, since we have no PP and TP.
## ZeRO Data Parallelism + Pipeline Parallelism + Tensor Parallelism

One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been
discussed in [ZeRO Data Parallelism](#zero-data-parallelism). Normally it's a standalone feature that doesn't require PP or TP.
But it can be combined with PP and TP.

When ZeRO-DP is combined with PP (and optionally TP) it typically enables only ZeRO stage 1 (optimizer sharding).

While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have negative
performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate
the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism,
small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with
minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to impact the performance.

In addition, there are already fewer layers than normal due to PP and so the memory savings won't be huge. PP already
reduces gradient size by ``1/PP``, and so gradient sharding savings on top of that are less significant than pure DP.

ZeRO stage 3 is not a good choice either for the same reason - more inter-node communications are required.
## FlexFlow

FlexFlow solves the parallelization problem in a slightly different way, parallelizing over the Sample, Operator, Attribute and Parameter dimensions:

* Sample

Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes 5 x 2 x 512.

* Operator

If we perform layer normalization, we compute std first and mean second, and then we can normalize data.
Operator parallelism allows computing std and mean in parallel. So if we parallelize them by operator dimension into 2
devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std, cuda:1 computes mean at the same time.

* Attribute

We have 10 batches of 512 length. If we parallelize them by attribute dimension into 2 devices, 10 x 512 becomes 10 x 2 x 256.

* Parameter

It is similar to tensor model parallelism or naive layer-wise model parallelism.

<div class="flex justify-center">
     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-flexflow.jpeg" alt="flex-flow-soap"/>
</div>

The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3)
fast-intra-connect/slow-inter-connect and it automatically optimizes all of these algorithmically, deciding which
parallelisation to use where.

One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and
fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations.

So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best
strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan
for that. And then you can train. A different setup will have its own custom optimization.

🤗 Transformers status: Transformers models are FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py),
which is a prerequisite for FlexFlow, however, changes are required on the FlexFlow side to make it work with Transformers models.
For additional information on tf32 vs other precisions, please refer to the following benchmarks:
[RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004390803) and
[A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1004543189).

## Flash Attention 2

You can speed up the training throughput by using the Flash Attention 2 integration in transformers. Check out the appropriate section in the [single GPU section](./perf_infer_gpu_one#Flash-Attention-2) to learn more about how to load a model with Flash Attention 2 modules.
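As a minimal sketch (the checkpoint name is just an example, and the `flash-attn` package plus a supported GPU are assumed), loading a model with Flash Attention 2 enabled looks like this:

```py
# Minimal sketch: enabling Flash Attention 2 when loading a supported model.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b",              # example checkpoint with FA2 support
    torch_dtype=torch.bfloat16,      # FA2 requires fp16 or bf16 weights
    use_flash_attention_2=True,      # swaps the attention layers for Flash Attention 2
)
```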
## Optimizer choice

The most common optimizer used to train transformer models is Adam or AdamW (Adam with weight decay). Adam achieves
good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory
footprint of the order of the number of model parameters.
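With the [`Trainer`], the optimizer is selected via `TrainingArguments.optim`. Here is a minimal sketch; the exact set of accepted values depends on your installed extras (e.g. `bitsandbytes` for the 8-bit variant):

```py
# Minimal sketch: choosing the optimizer through TrainingArguments.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output",
    optim="adamw_torch",  # alternatives include "adafactor" or "adamw_bnb_8bit" to reduce optimizer memory
)
```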
In the following sections we go through the steps to run inference on CPU and single/multi-GPU setups:

* [Inference on a single CPU](perf_infer_cpu)
* [Inference on a single GPU](perf_infer_gpu_one)
* [Multi-GPU inference](perf_infer_gpu_one)
* [XLA Integration for TensorFlow Models](tf_xla)
Take a look at the [`pipeline`] documentation for a complete list of supported tasks.

## Pipeline usage

While each task has an associated [`pipeline`], it is simpler to use the general [`pipeline`] abstraction which contains
all the task-specific pipelines. The [`pipeline`] automatically loads a default model and a preprocessing class capable
of inference for your task. Let's take the example of using the [`pipeline`] for automatic speech recognition (ASR), or
speech-to-text.

1. Start by creating a [`pipeline`] and specify the inference task:
```py
>>> from transformers import pipeline

>>> transcriber = pipeline(task="automatic-speech-recognition")
```

2. Pass your input to the [`pipeline`]. In the case of speech recognition, this is an audio input file:

```py
>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'}
```
Not the result you had in mind? Check out some of the [most downloaded automatic speech recognition models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=downloads) on the Hub to see if you can get a better transcription.
|
||||
Let's try [openai/whisper-large](https://huggingface.co/openai/whisper-large):
|
||||
Not the result you had in mind? Check out some of the [most downloaded automatic speech recognition models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending)
|
||||
on the Hub to see if you can get a better transcription.
|
||||
|
||||
Let's try the [Whisper large-v2](https://huggingface.co/openai/whisper-large) model from OpenAI. Whisper was released
|
||||
2 years later than Wav2Vec2, and was trained on close to 10x more data. As such, it beats Wav2Vec2 on most downstream
|
||||
benchmarks. It also has the added benefit of predicting punctuation and casing, neither of which are possible with
|
||||
Wav2Vec2.
|
||||
|
||||
Let's give it a try here to see how it performs:
|
||||
|
||||
```py
|
||||
>>> generator = pipeline(model="openai/whisper-large")
|
||||
>>> generator("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
|
||||
>>> transcriber = pipeline(model="openai/whisper-large-v2")
|
||||
>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
|
||||
{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
|
||||
```
|
||||
|
||||
Now this result looks more accurate!
|
||||
Now this result looks more accurate! For a deep-dive comparison on Wav2Vec2 vs Whisper, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/asr_models).
|
||||
We really encourage you to check out the Hub for models in different languages, models specialized in your field, and more.
|
||||
You can check out and compare model results directly from your browser on the Hub to see if it fits or
|
||||
handles corner cases better than other ones.
|
||||
@ -65,7 +76,7 @@ And if you don't find a model for your use case, you can always start [training]
|
||||
If you have several inputs, you can pass your input as a list:
|
||||
|
||||
```py
|
||||
generator(
|
||||
transcriber(
|
||||
[
|
||||
"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac",
|
||||
"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac",
|
||||
@ -73,22 +84,22 @@ generator(
|
||||
)
|
||||
```
|
||||
|
||||
If you want to iterate over a whole dataset, or want to use it for inference in a webserver, check out dedicated parts
|
||||
|
||||
[Using pipelines on a dataset](#using-pipelines-on-a-dataset)
|
||||
|
||||
[Using pipelines for a webserver](./pipeline_webserver)
|
||||
Pipelines are great for experimentation as switching from one model to another is trivial; however, there are some ways to optimize them for larger workloads than experimentation. See the following guides that dive into iterating over whole datasets or using pipelines in a webserver:
|
||||
of the docs:
|
||||
* [Using pipelines on a dataset](#using-pipelines-on-a-dataset)
|
||||
* [Using pipelines for a webserver](./pipeline_webserver)
|
||||
|
||||
## Parameters
|
||||
|
||||
[`pipeline`] supports many parameters; some are task specific, and some are general to all pipelines.
|
||||
In general you can specify parameters anywhere you want:
|
||||
In general, you can specify parameters anywhere you want:
|
||||
|
||||
```py
|
||||
generator = pipeline(model="openai/whisper-large", my_parameter=1)
|
||||
out = generator(...) # This will use `my_parameter=1`.
|
||||
out = generator(..., my_parameter=2) # This will override and use `my_parameter=2`.
|
||||
out = generator(...) # This will go back to using `my_parameter=1`.
|
||||
transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1)
|
||||
|
||||
out = transcriber(...) # This will use `my_parameter=1`.
|
||||
out = transcriber(..., my_parameter=2) # This will override and use `my_parameter=2`.
|
||||
out = transcriber(...) # This will go back to using `my_parameter=1`.
|
||||
```
|
||||
|
||||
Let's check out 3 important ones:
|
||||
@ -99,14 +110,21 @@ If you use `device=n`, the pipeline automatically puts the model on the specifie
|
||||
This will work regardless of whether you are using PyTorch or TensorFlow.
|
||||
|
||||
```py
|
||||
generator = pipeline(model="openai/whisper-large", device=0)
|
||||
transcriber = pipeline(model="openai/whisper-large-v2", device=0)
|
||||
```
|
||||
|
||||
If the model is too large for a single GPU, you can set `device_map="auto"` to allow 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to automatically determine how to load and store the model weights.
If the model is too large for a single GPU and you are using PyTorch, you can set `device_map="auto"` to automatically
determine how to load and store the model weights. Using the `device_map` argument requires the 🤗 [Accelerate](https://huggingface.co/docs/accelerate)
package:
|
||||
|
||||
```bash
|
||||
pip install --upgrade accelerate
|
||||
```
|
||||
|
||||
The following code automatically loads and stores model weights across devices:
|
||||
|
||||
```py
|
||||
#!pip install accelerate
|
||||
generator = pipeline(model="openai/whisper-large", device_map="auto")
|
||||
transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")
|
||||
```
|
||||
|
||||
Note that if `device_map="auto"` is passed, there is no need to add the argument `device=device` when instantiating your `pipeline` as you may encounter some unexpected behavior!
|
||||
@ -118,12 +136,12 @@ By default, pipelines will not batch inference for reasons explained in detail [
|
||||
But if it works in your use case, you can use:
|
||||
|
||||
```py
|
||||
generator = pipeline(model="openai/whisper-large", device=0, batch_size=2)
|
||||
audio_filenames = [f"audio_{i}.flac" for i in range(10)]
|
||||
texts = generator(audio_filenames)
|
||||
transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2)
|
||||
audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)]
|
||||
texts = transcriber(audio_filenames)
|
||||
```
|
||||
|
||||
This runs the pipeline on the 10 provided audio files, but it will pass them in batches of 2
|
||||
This runs the pipeline on the 4 provided audio files, but it will pass them in batches of 2
|
||||
to the model (which is on a GPU, where batching is more likely to help) without requiring any further code from you.
|
||||
The output should always match what you would have received without batching. It is only meant as a way to help you get more speed out of a pipeline.
|
||||
|
||||
@ -136,18 +154,23 @@ For instance, the [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] m
|
||||
|
||||
|
||||
```py
|
||||
>>> # Not using whisper, as it cannot provide timestamps.
|
||||
>>> generator = pipeline(model="facebook/wav2vec2-large-960h-lv60-self", return_timestamps="word")
|
||||
>>> generator("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
|
||||
{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP AND LIVE OUT THE TRUE MEANING OF ITS CREED', 'chunks': [{'text': 'I', 'timestamp': (1.22, 1.24)}, {'text': 'HAVE', 'timestamp': (1.42, 1.58)}, {'text': 'A', 'timestamp': (1.66, 1.68)}, {'text': 'DREAM', 'timestamp': (1.76, 2.14)}, {'text': 'BUT', 'timestamp': (3.68, 3.8)}, {'text': 'ONE', 'timestamp': (3.94, 4.06)}, {'text': 'DAY', 'timestamp': (4.16, 4.3)}, {'text': 'THIS', 'timestamp': (6.36, 6.54)}, {'text': 'NATION', 'timestamp': (6.68, 7.1)}, {'text': 'WILL', 'timestamp': (7.32, 7.56)}, {'text': 'RISE', 'timestamp': (7.8, 8.26)}, {'text': 'UP', 'timestamp': (8.38, 8.48)}, {'text': 'AND', 'timestamp': (10.08, 10.18)}, {'text': 'LIVE', 'timestamp': (10.26, 10.48)}, {'text': 'OUT', 'timestamp': (10.58, 10.7)}, {'text': 'THE', 'timestamp': (10.82, 10.9)}, {'text': 'TRUE', 'timestamp': (10.98, 11.18)}, {'text': 'MEANING', 'timestamp': (11.26, 11.58)}, {'text': 'OF', 'timestamp': (11.66, 11.7)}, {'text': 'ITS', 'timestamp': (11.76, 11.88)}, {'text': 'CREED', 'timestamp': (12.0, 12.38)}]}
|
||||
>>> transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True)
|
||||
>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
|
||||
{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.', 'chunks': [{'timestamp': (0.0, 11.88), 'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its'}, {'timestamp': (11.88, 12.38), 'text': ' creed.'}]}
|
||||
```
|
||||
|
||||
As you can see, the model inferred the text and also outputted **when** the various words were pronounced
|
||||
in the sentence.
|
||||
As you can see, the model inferred the text and also outputted **when** the various sentences were pronounced.
|
||||
|
||||
There are many parameters available for each task, so check out each task's API reference to see what you can tinker with!
|
||||
For instance, the [`~transformers.AutomaticSpeechRecognitionPipeline`] has a `chunk_length_s` parameter which is helpful for working on really long audio files (for example, subtitling entire movies or hour-long videos) that a model typically cannot handle on its own.
|
||||
For instance, the [`~transformers.AutomaticSpeechRecognitionPipeline`] has a `chunk_length_s` parameter which is helpful
|
||||
for working on really long audio files (for example, subtitling entire movies or hour-long videos) that a model typically
|
||||
cannot handle on its own:
|
||||
|
||||
```python
|
||||
>>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30, return_timestamps=True)
|
||||
>>> transcriber("https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav")
|
||||
{'text': " Chapter 16. I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I, too, agree to whatever Marguerite wished, Marguerite to be unable to live apart from me. It was the day after the evening...
|
||||
```
|
||||
|
||||
If you can't find a parameter that would really help you out, feel free to [request it](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)!
|
||||
|
||||
|
||||
@ -124,6 +124,7 @@ This checks that:
|
||||
- The translations of the READMEs and the index of the doc have the same model list as the main README (performed by `utils/check_copies.py`)
|
||||
- The auto-generated tables in the documentation are up to date (performed by `utils/check_table.py`)
|
||||
- The library has all objects available even if not all optional dependencies are installed (performed by `utils/check_dummies.py`)
|
||||
- All docstrings properly document the arguments in the signature of the object (performed by `utils/check_docstrings.py`)
|
||||
|
||||
Should this check fail, the first two items require manual fixing, the last four can be fixed automatically for you by running the command
|
||||
|
||||
|
||||
@ -306,7 +306,7 @@ Create a function to preprocess the dataset so the audio samples are the same le
|
||||
... return inputs
|
||||
```
|
||||
|
||||
Apply the `preprocess_function` to the the first few examples in the dataset:
|
||||
Apply the `preprocess_function` to the first few examples in the dataset:
|
||||
|
||||
```py
|
||||
>>> processed_dataset = preprocess_function(dataset[:5])
|
||||
@ -412,8 +412,7 @@ If you wish to normalize images as a part of the augmentation transformation, us
|
||||
and `image_processor.image_std` values.
|
||||
</Tip>
|
||||
|
||||
3. Then use 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) to apply the transforms on the fly:
3. Then use 🤗 Datasets [`~datasets.Dataset.set_transform`] to apply the transforms on the fly:
|
||||
```py
|
||||
>>> dataset.set_transform(transforms)
|
||||
```
|
||||
|
||||
@ -276,7 +276,7 @@ We can instruct the model to classify the image into one of the categories that
|
||||
>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
|
||||
>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
|
||||
|
||||
>>> generated_ids = model.generate(**inputs, max_new_tokens=4, bad_words_ids=bad_words_ids)
|
||||
>>> generated_ids = model.generate(**inputs, max_new_tokens=6, bad_words_ids=bad_words_ids)
|
||||
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
||||
>>> print(generated_text[0])
|
||||
Instruction: Classify the following image into a single category from the following list: ['animals', 'vegetables', 'city landscape', 'cars', 'office'].
|
||||
@ -357,7 +357,7 @@ for a batch of examples by passing a list of prompts:
|
||||
... ],
|
||||
... ]
|
||||
|
||||
>>> inputs = processor(prompts, return_tensors="pt")
|
||||
>>> inputs = processor(prompts, return_tensors="pt").to("cuda")
|
||||
>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
|
||||
|
||||
>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
|
||||
|
||||
132 docs/source/en/tasks/image_to_image.md Normal file
@ -0,0 +1,132 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Image-to-Image Task Guide
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Image-to-Image task is the task where an application receives an image and outputs another image. This has various subtasks, including image enhancement (super resolution, low light enhancement, deraining and so on), image inpainting, and more.
|
||||
|
||||
This guide will show you how to:
|
||||
- Use an image-to-image pipeline for a super resolution task,
- Run image-to-image models for the same task without a pipeline.

Note that as of the time this guide is released, the `image-to-image` pipeline only supports the super resolution task.
|
||||
|
||||
Let's begin by installing the necessary libraries.
|
||||
|
||||
```bash
|
||||
pip install transformers
|
||||
```
|
||||
|
||||
We can now initialize the pipeline with a [Swin2SR model](https://huggingface.co/caidas/swin2SR-lightweight-x2-64). We can then infer with the pipeline by calling it with an image. As of now, only [Swin2SR models](https://huggingface.co/models?sort=trending&search=swin2sr) are supported in this pipeline.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
import torch  # needed for torch.device below

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pipe = pipeline(task="image-to-image", model="caidas/swin2SR-lightweight-x2-64", device=device)
|
||||
```
|
||||
|
||||
Now, let's load an image.
|
||||
|
||||
```python
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/cat.jpg"
|
||||
image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
print(image.size)
|
||||
```
|
||||
```bash
|
||||
# (532, 432)
|
||||
```
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/cat.jpg" alt="Photo of a cat"/>
|
||||
</div>
|
||||
|
||||
We can now do inference with the pipeline. We will get an upscaled version of the cat image.
|
||||
|
||||
```python
|
||||
upscaled = pipe(image)
|
||||
print(upscaled.size)
|
||||
```
|
||||
```bash
|
||||
# (1072, 880)
|
||||
```
|
||||
|
||||
If you wish to do inference yourself with no pipeline, you can use the `Swin2SRForImageSuperResolution` and `Swin2SRImageProcessor` classes of transformers. We will use the same model checkpoint for this. Let's initialize the model and the processor.
|
||||
|
||||
```python
|
||||
from transformers import Swin2SRForImageSuperResolution, Swin2SRImageProcessor
|
||||
|
||||
model = Swin2SRForImageSuperResolution.from_pretrained("caidas/swin2SR-lightweight-x2-64").to(device)
|
||||
processor = Swin2SRImageProcessor.from_pretrained("caidas/swin2SR-lightweight-x2-64")
|
||||
```
|
||||
|
||||
`pipeline` abstracts away the preprocessing and postprocessing steps that we have to do ourselves, so let's preprocess the image. We will pass the image to the processor and then move the pixel values to GPU.
|
||||
|
||||
```python
|
||||
pixel_values = processor(image, return_tensors="pt").pixel_values
|
||||
print(pixel_values.shape)
|
||||
|
||||
pixel_values = pixel_values.to(device)
|
||||
```
|
||||
|
||||
We can now infer the image by passing pixel values to the model.
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(pixel_values)
|
||||
```
|
||||
Output is an object of type `ImageSuperResolutionOutput` that looks like below 👇
|
||||
|
||||
```
|
||||
(loss=None, reconstruction=tensor([[[[0.8270, 0.8269, 0.8275, ..., 0.7463, 0.7446, 0.7453],
|
||||
[0.8287, 0.8278, 0.8283, ..., 0.7451, 0.7448, 0.7457],
|
||||
[0.8280, 0.8273, 0.8269, ..., 0.7447, 0.7446, 0.7452],
|
||||
...,
|
||||
[0.5923, 0.5933, 0.5924, ..., 0.0697, 0.0695, 0.0706],
|
||||
[0.5926, 0.5932, 0.5926, ..., 0.0673, 0.0687, 0.0705],
|
||||
[0.5927, 0.5914, 0.5922, ..., 0.0664, 0.0694, 0.0718]]]],
|
||||
device='cuda:0'), hidden_states=None, attentions=None)
|
||||
```
|
||||
We need to get the `reconstruction` and post-process it for visualization. Let's see how it looks.
|
||||
|
||||
```python
|
||||
outputs.reconstruction.data.shape
|
||||
# torch.Size([1, 3, 880, 1072])
|
||||
```
|
||||
|
||||
We need to squeeze the output and get rid of axis 0, clip the values, then convert it to a numpy float array. Then we will rearrange the axes to have the shape (880, 1072, 3), and finally, bring the output values back to the range [0, 255].
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
# squeeze, take to CPU and clip the values
|
||||
output = outputs.reconstruction.data.squeeze().cpu().clamp_(0, 1).numpy()
|
||||
# rearrange the axes
|
||||
output = np.moveaxis(output, source=0, destination=-1)
|
||||
# bring values back to pixel values range
|
||||
output = (output * 255.0).round().astype(np.uint8)
|
||||
Image.fromarray(output)
|
||||
```
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/cat_upscaled.png" alt="Upscaled photo of a cat"/>
|
||||
</div>
|
||||
@ -0,0 +1,186 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
# Knowledge Distillation for Computer Vision
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification in this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between its outputs and the teacher's outputs, thus making it mimic the behavior. It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this.
|
||||
|
||||
This guide demonstrates how you can distill a [fine-tuned ViT model](https://huggingface.co/merve/vit-mobilenet-beans-224) (teacher model) to a [MobileNet](https://huggingface.co/google/mobilenet_v2_1.4_224) (student model) using the [Trainer API](https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer) of 🤗 Transformers.
|
||||
|
||||
Let's install the libraries needed for distillation and evaluating the process.
|
||||
|
||||
```bash
|
||||
pip install transformers datasets accelerate tensorboard evaluate --upgrade
|
||||
```
|
||||
|
||||
In this example, we are using the `merve/beans-vit-224` model as teacher model. It's an image classification model, based on `google/vit-base-patch16-224-in21k` fine-tuned on beans dataset. We will distill this model to a randomly initialized MobileNetV2.
|
||||
|
||||
We will now load the dataset.
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
dataset = load_dataset("beans")
|
||||
```
|
||||
|
||||
We can use an image processor from either of the models, as in this case they return the same output with the same resolution. We will use the `map()` method of `dataset` to apply the preprocessing to every split of the dataset.
|
||||
|
||||
```python
|
||||
from transformers import AutoImageProcessor
|
||||
teacher_processor = AutoImageProcessor.from_pretrained("merve/beans-vit-224")
|
||||
|
||||
def process(examples):
|
||||
processed_inputs = teacher_processor(examples["image"])
|
||||
return processed_inputs
|
||||
|
||||
processed_datasets = dataset.map(process, batched=True)
|
||||
```
|
||||
|
||||
Essentially, we want the student model (a randomly initialized MobileNet) to mimic the teacher model (fine-tuned vision transformer). To achieve this, we first get the logits output from the teacher and the student. Then, we divide each of them by the parameter `temperature` which controls the importance of each soft target. A parameter called `lambda` weighs the importance of the distillation loss. In this example, we will use `temperature=5` and `lambda=0.5`. We will use the Kullback-Leibler Divergence loss to compute the divergence between the student and the teacher. Given two distributions P and Q, KL divergence measures how much extra information we need to represent P using Q. If the two are identical, their KL divergence is zero, as there is no extra information needed to explain P from Q. Thus, in the context of knowledge distillation, KL divergence is useful.
|
||||
|
||||
|
||||
```python
|
||||
from transformers import TrainingArguments, Trainer
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class ImageDistilTrainer(Trainer):
    def __init__(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None, *args, **kwargs):
        super().__init__(*args, model=student_model, **kwargs)
        self.teacher = teacher_model
        self.student = student_model
        self.loss_function = nn.KLDivLoss(reduction="batchmean")
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.teacher.to(device)
        self.teacher.eval()
        self.temperature = temperature
        self.lambda_param = lambda_param

    def compute_loss(self, student, inputs, return_outputs=False):
        student_output = self.student(**inputs)

        with torch.no_grad():
            teacher_output = self.teacher(**inputs)

        # Compute soft targets for teacher and student
        soft_teacher = F.softmax(teacher_output.logits / self.temperature, dim=-1)
        soft_student = F.log_softmax(student_output.logits / self.temperature, dim=-1)

        # Compute the distillation loss, scaled by temperature squared
        distillation_loss = self.loss_function(soft_student, soft_teacher) * (self.temperature ** 2)

        # Compute the true label loss
        student_target_loss = student_output.loss

        # Combine the two losses using the lambda weighting
        loss = (1. - self.lambda_param) * student_target_loss + self.lambda_param * distillation_loss
        return (loss, student_output) if return_outputs else loss
```
|
||||
|
||||
We will now login to Hugging Face Hub so we can push our model to the Hugging Face Hub through the `Trainer`.
|
||||
|
||||
```python
|
||||
from huggingface_hub import notebook_login
|
||||
|
||||
notebook_login()
|
||||
```
|
||||
|
||||
Let's set the `TrainingArguments`, the teacher model and the student model.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForImageClassification, MobileNetV2Config, MobileNetV2ForImageClassification
|
||||
|
||||
repo_name = "my-awesome-model"  # placeholder Hub repo name; replace with your own

training_args = TrainingArguments(
    output_dir="my-awesome-model",
    num_train_epochs=30,
    fp16=True,
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
)
|
||||
|
||||
num_labels = len(processed_datasets["train"].features["labels"].names)
|
||||
|
||||
# initialize models
|
||||
teacher_model = AutoModelForImageClassification.from_pretrained(
|
||||
"merve/beans-vit-224",
|
||||
num_labels=num_labels,
|
||||
ignore_mismatched_sizes=True
|
||||
)
|
||||
|
||||
# training MobileNetV2 from scratch
|
||||
student_config = MobileNetV2Config()
|
||||
student_config.num_labels = num_labels
|
||||
student_model = MobileNetV2ForImageClassification(student_config)
|
||||
```
|
||||
|
||||
We can use the `compute_metrics` function to evaluate our model on the test set. This function will be used during the training process to compute the `accuracy` of our model.
|
||||
|
||||
```python
|
||||
import evaluate
|
||||
import numpy as np
|
||||
|
||||
accuracy = evaluate.load("accuracy")
|
||||
|
||||
def compute_metrics(eval_pred):
|
||||
predictions, labels = eval_pred
|
||||
acc = accuracy.compute(references=labels, predictions=np.argmax(predictions, axis=1))
|
||||
return {"accuracy": acc["accuracy"]}
|
||||
```
|
||||
|
||||
Let's initialize the `Trainer` with the training arguments we defined. We will also initialize our data collator.
|
||||
|
||||
```python
|
||||
from transformers import DefaultDataCollator
|
||||
|
||||
data_collator = DefaultDataCollator()
|
||||
trainer = ImageDistilTrainer(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["validation"],
    data_collator=data_collator,
    tokenizer=teacher_processor,
    compute_metrics=compute_metrics,
    temperature=5,
    lambda_param=0.5
)
|
||||
```
|
||||
|
||||
We can now train our model.
|
||||
|
||||
```python
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
We can evaluate the model on the test set.
|
||||
|
||||
```python
|
||||
trainer.evaluate(processed_datasets["test"])
|
||||
```
|
||||
|
||||
On the test set, our model reaches 72 percent accuracy. As a sanity check of the efficiency of distillation, we also trained MobileNet on the beans dataset from scratch with the same hyperparameters and observed 63 percent accuracy on the test set. We invite readers to try different pre-trained teacher models, student architectures, and distillation parameters and report their findings. The training logs and checkpoints for the distilled model can be found in [this repository](https://huggingface.co/merve/vit-mobilenet-beans-224), and MobileNetV2 trained from scratch can be found in this [repository](https://huggingface.co/merve/resnet-mobilenet-beans-5).
|
||||
@ -37,7 +37,7 @@ You can finetune other architectures for causal language modeling following the
|
||||
Choose one of the following architectures:
|
||||
|
||||
<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
|
||||
[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
|
||||
[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
|
||||
|
||||
|
||||
|
||||
|
||||
439 docs/source/en/tasks/prompting.md Normal file
@ -0,0 +1,439 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
|
||||
# LLM prompting guide
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Large Language Models such as Falcon, LLaMA, etc. are pretrained transformer models initially trained to predict the
|
||||
next token given some input text. They typically have billions of parameters and have been trained on trillions of
|
||||
tokens for an extended period of time. As a result, these models become quite powerful and versatile, and you can use
|
||||
them to solve multiple NLP tasks out of the box by instructing the models with natural language prompts.
|
||||
|
||||
Designing such prompts to ensure the optimal output is often called "prompt engineering". Prompt engineering is an
|
||||
iterative process that requires a fair amount of experimentation. Natural languages are much more flexible and expressive
|
||||
than programming languages, however, they can also introduce some ambiguity. At the same time, prompts in natural language
|
||||
are quite sensitive to changes. Even minor modifications in prompts can lead to wildly different outputs.
|
||||
|
||||
While there is no exact recipe for creating prompts to match all cases, researchers have worked out a number of best
|
||||
practices that help to achieve optimal results more consistently.
|
||||
|
||||
This guide covers the prompt engineering best practices to help you craft better LLM prompts and solve various NLP tasks.
|
||||
You'll learn:
|
||||
|
||||
- [Basics of prompting](#basic-prompts)
|
||||
- [Best practices of LLM prompting](#best-practices-of-llm-prompting)
|
||||
- [Advanced prompting techniques: few-shot prompting and chain-of-thought](#advanced-prompting-techniques)
|
||||
- [When to fine-tune instead of prompting](#prompting-vs-fine-tuning)
|
||||
|
||||
<Tip>
|
||||
|
||||
Prompt engineering is only a part of the LLM output optimization process. Another essential component is choosing the
|
||||
optimal text generation strategy. You can customize how your LLM selects each of the subsequent tokens when generating
|
||||
the text without modifying any of the trainable parameters. By tweaking the text generation parameters, you can reduce
|
||||
repetition in the generated text and make it more coherent and human-sounding.
|
||||
Text generation strategies and parameters are out of scope for this guide, but you can learn more about these topics in
|
||||
the following guides:
|
||||
|
||||
* [Generation with LLMs](../llm_tutorial)
|
||||
* [Text generation strategies](../generation_strategies)
|
||||
|
||||
</Tip>
|
||||
|
||||
## Basics of prompting
|
||||
|
||||
### Types of models
|
||||
|
||||
The majority of modern LLMs are decoder-only transformers. Some examples include: [LLaMA](../model_doc/llama),
|
||||
[Llama2](../model_doc/llama2), [Falcon](../model_doc/falcon), [GPT2](../model_doc/gpt2). However, you may encounter
|
||||
encoder-decoder transformer LLMs as well, for instance, [Flan-T5](../model_doc/flan-t5) and [BART](../model_doc/bart).
|
||||
|
||||
Encoder-decoder-style models are typically used in generative tasks where the output **heavily** relies on the input, for
|
||||
example, in translation and summarization. The decoder-only models are used for all other types of generative tasks.
|
||||
|
||||
When using a pipeline to generate text with an LLM, it's important to know what type of LLM you are using, because
|
||||
they use different pipelines.
|
||||
|
||||
Run inference with decoder-only models with the `text-generation` pipeline:
|
||||
|
||||
```python
|
||||
>>> from transformers import pipeline
|
||||
>>> import torch
|
||||
|
||||
>>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
|
||||
|
||||
>>> generator = pipeline('text-generation', model = 'gpt2')
|
||||
>>> prompt = "Hello, I'm a language model"
|
||||
|
||||
>>> generator(prompt, max_length = 30)
|
||||
[{'generated_text': "Hello, I'm a language model expert, so I'm a big believer in the concept that I know very well and then I try to look into"}]
|
||||
```
|
||||
|
||||
To run inference with an encoder-decoder, use the `text2text-generation` pipeline:
|
||||
|
||||
```python
|
||||
>>> text2text_generator = pipeline("text2text-generation", model = 'google/flan-t5-base')
|
||||
>>> prompt = "Translate from English to French: I'm very happy to see you"
|
||||
|
||||
>>> text2text_generator(prompt)
|
||||
[{'generated_text': 'Je suis très heureuse de vous rencontrer.'}]
|
||||
```
|
||||
|
||||
### Base vs instruct/chat models
|
||||
|
||||
Most of the recent LLM checkpoints available on ЁЯдЧ Hub come in two versions: base and instruct (or chat). For example,
|
||||
[`tiiuae/falcon-7b`](https://huggingface.co/tiiuae/falcon-7b) and [`tiiuae/falcon-7b-instruct`](https://huggingface.co/tiiuae/falcon-7b-instruct).
|
||||
|
||||
Base models are excellent at completing the text when given an initial prompt, however, they are not ideal for NLP tasks
|
||||
where they need to follow instructions, or for conversational use. This is where the instruct (chat) versions come in.
|
||||
These checkpoints are the result of further fine-tuning of the pre-trained base versions on instructions and conversational data.
|
||||
This additional fine-tuning makes them a better choice for many NLP tasks.
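To make the difference concrete, here is a minimal sketch that sends the same instruction to the base and instruct checkpoints mentioned above. It assumes you have enough GPU memory to load a 7B checkpoint in `bfloat16`; the exact outputs will vary, but the base model tends to simply continue the text, while the instruct model tends to follow the instruction:

```python
from transformers import pipeline
import torch

prompt = "Write a haiku about open-source software."

for checkpoint in ["tiiuae/falcon-7b", "tiiuae/falcon-7b-instruct"]:
    pipe = pipeline(
        "text-generation",
        model=checkpoint,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    output = pipe(prompt, max_new_tokens=30, do_sample=True, top_k=10, return_full_text=False)
    print(checkpoint, "->", output[0]["generated_text"])
```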
|
||||
|
||||
Let's illustrate some simple prompts that you can use with [`tiiuae/falcon-7b-instruct`](https://huggingface.co/tiiuae/falcon-7b-instruct)
|
||||
to solve some common NLP tasks.
|
||||
|
||||
### NLP tasks
|
||||
|
||||
First, let's set up the environment:
|
||||
|
||||
```bash
|
||||
pip install -q transformers accelerate
|
||||
```
|
||||
|
||||
Next, let's load the model with the appropriate pipeline (`"text-generation"`):
|
||||
|
||||
```python
|
||||
>>> from transformers import pipeline, AutoTokenizer
|
||||
>>> import torch
|
||||
|
||||
>>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
|
||||
>>> model = "tiiuae/falcon-7b-instruct"
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
>>> pipe = pipeline(
|
||||
... "text-generation",
|
||||
... model=model,
|
||||
... tokenizer=tokenizer,
|
||||
... torch_dtype=torch.bfloat16,
|
||||
... device_map="auto",
|
||||
... )
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Note that Falcon models were trained using the `bfloat16` datatype, so we recommend you use the same. This requires a recent
|
||||
version of CUDA and works best on modern cards.
|
||||
|
||||
</Tip>
|
||||
|
||||
Now that we have the model loaded via the pipeline, let's explore how you can use prompts to solve NLP tasks.
|
||||
|
||||
#### Text classification
|
||||
|
||||
One of the most common forms of text classification is sentiment analysis, which assigns a label like "positive", "negative",
|
||||
or "neutral" to a sequence of text. Let's write a prompt that instructs the model to classify a given text (a movie review).
|
||||
We'll start by giving the instruction, and then specifying the text to classify. Note that instead of leaving it at that, we're
|
||||
also adding the beginning of the response - `"Sentiment: "`:
|
||||
|
||||
```python
|
||||
>>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
|
||||
>>> prompt = """Classify the text into neutral, negative or positive.
|
||||
... Text: This movie is definitely one of my favorite movies of its kind. The interaction between respectable and morally strong characters is an ode to chivalry and the honor code amongst thieves and policemen.
|
||||
... Sentiment:
|
||||
... """
|
||||
|
||||
>>> sequences = pipe(
|
||||
... prompt,
|
||||
... max_new_tokens=10,
|
||||
... )
|
||||
|
||||
>>> for seq in sequences:
|
||||
... print(f"Result: {seq['generated_text']}")
|
||||
Result: Classify the text into neutral, negative or positive.
|
||||
Text: This movie is definitely one of my favorite movies of its kind. The interaction between respectable and morally strong characters is an ode to chivalry and the honor code amongst thieves and policemen.
|
||||
Sentiment:
|
||||
Positive
|
||||
```
|
||||
|
||||
As a result, the output contains a classification label from the list we have provided in the instructions, and it is a correct one!
|
||||
|
||||
<Tip>
|
||||
|
||||
You may notice that in addition to the prompt, we pass a `max_new_tokens` parameter. It controls the number of tokens the
|
||||
model shall generate, and it is one of the many text generation parameters that you can learn about
|
||||
in [Text generation strategies](../generation_strategies) guide.
|
||||
|
||||
</Tip>
|
||||
|
||||
#### Named Entity Recognition
|
||||
|
||||
Named Entity Recognition (NER) is a task of finding named entities in a piece of text, such as a person, location, or organization.
|
||||
Let's modify the instructions in the prompt to make the LLM perform this task. Here, let's also set `return_full_text = False`
|
||||
so that output doesn't contain the prompt:
|
||||
|
||||
```python
|
||||
>>> torch.manual_seed(1) # doctest: +IGNORE_RESULT
|
||||
>>> prompt = """Return a list of named entities in the text.
|
||||
... Text: The Golden State Warriors are an American professional basketball team based in San Francisco.
|
||||
... Named entities:
|
||||
... """
|
||||
|
||||
>>> sequences = pipe(
|
||||
... prompt,
|
||||
... max_new_tokens=15,
|
||||
... return_full_text = False,
|
||||
... )
|
||||
|
||||
>>> for seq in sequences:
|
||||
... print(f"{seq['generated_text']}")
|
||||
- Golden State Warriors
|
||||
- San Francisco
|
||||
```
|
||||
|
||||
As you can see, the model correctly identified two named entities from the given text.
|
||||
|
||||
#### Translation
|
||||
|
||||
Another task LLMs can perform is translation. You can choose to use encoder-decoder models for this task, however, here,
|
||||
for the simplicity of the examples, we'll keep using Falcon-7b-instruct, which does a decent job. Once again, here's how
|
||||
you can write a basic prompt to instruct a model to translate a piece of text from English to Italian:
|
||||
|
||||
```python
|
||||
>>> torch.manual_seed(2) # doctest: +IGNORE_RESULT
|
||||
>>> prompt = """Translate the English text to Italian.
|
||||
... Text: Sometimes, I've believed as many as six impossible things before breakfast.
|
||||
... Translation:
|
||||
... """
|
||||
|
||||
>>> sequences = pipe(
|
||||
... prompt,
|
||||
... max_new_tokens=20,
|
||||
... do_sample=True,
|
||||
... top_k=10,
|
||||
... return_full_text = False,
|
||||
... )
|
||||
|
||||
>>> for seq in sequences:
|
||||
... print(f"{seq['generated_text']}")
|
||||
A volte, ho creduto a sei impossibili cose prima di colazione.
|
||||
```
|
||||
|
||||
Here we've added `do_sample=True` and `top_k=10` to allow the model to be a bit more flexible when generating output.
|
||||
|
||||
#### Text summarization
|
||||
|
||||
Similar to the translation, text summarization is another generative task where the output **heavily** relies on the input,
|
||||
and encoder-decoder models can be a better choice. However, decoder-style models can be used for this task as well.
|
||||
Previously, we have placed the instructions at the very beginning of the prompt. However, the very end of the prompt can
|
||||
also be a suitable location for instructions. Typically, it's better to place the instruction on one of the extreme ends.
|
||||
|
||||
```python
|
||||
>>> torch.manual_seed(3) # doctest: +IGNORE_RESULT
|
||||
>>> prompt = """Permaculture is a design process mimicking the diversity, functionality and resilience of natural ecosystems. The principles and practices are drawn from traditional ecological knowledge of indigenous cultures combined with modern scientific understanding and technological innovations. Permaculture design provides a framework helping individuals and communities develop innovative, creative and effective strategies for meeting basic needs while preparing for and mitigating the projected impacts of climate change.
|
||||
... Write a summary of the above text.
|
||||
... Summary:
|
||||
... """
|
||||
|
||||
>>> sequences = pipe(
|
||||
... prompt,
|
||||
... max_new_tokens=30,
|
||||
... do_sample=True,
|
||||
... top_k=10,
|
||||
... return_full_text = False,
|
||||
... )
|
||||
|
||||
>>> for seq in sequences:
|
||||
... print(f"{seq['generated_text']}")
|
||||
Permaculture is an ecological design mimicking natural ecosystems to meet basic needs and prepare for climate change. It is based on traditional knowledge and scientific understanding.
|
||||
```
|
||||
|
||||
#### Question answering
|
||||
|
||||
For question answering task we can structure the prompt into the following logical components: instructions, context, question, and
|
||||
the leading word or phrase (`"Answer:"`) to nudge the model to start generating the answer:
|
||||
|
||||
```python
|
||||
>>> torch.manual_seed(4) # doctest: +IGNORE_RESULT
|
||||
>>> prompt = """Answer the question using the context below.
|
||||
... Context: Gazpacho is a cold soup and drink made of raw, blended vegetables. Most gazpacho includes stale bread, tomato, cucumbers, onion, bell peppers, garlic, olive oil, wine vinegar, water, and salt. Northern recipes often include cumin and/or pimentón (smoked sweet paprika). Traditionally, gazpacho was made by pounding the vegetables in a mortar with a pestle; this more laborious method is still sometimes used as it helps keep the gazpacho cool and avoids the foam and silky consistency of smoothie versions made in blenders or food processors.
|
||||
... Question: What modern tool is used to make gazpacho?
|
||||
... Answer:
|
||||
... """
|
||||
|
||||
>>> sequences = pipe(
|
||||
... prompt,
|
||||
... max_new_tokens=10,
|
||||
... do_sample=True,
|
||||
... top_k=10,
|
||||
... return_full_text = False,
|
||||
... )
|
||||
|
||||
>>> for seq in sequences:
|
||||
... print(f"Result: {seq['generated_text']}")
|
||||
Result: Modern tools are used, such as immersion blenders
|
||||
```
|
||||
|
||||
#### Reasoning
|
||||
|
||||
Reasoning is one of the most difficult tasks for LLMs, and achieving good results often requires applying advanced prompting techniques, like
|
||||
[Chain-of-thought](#chain-of-thought).
|
||||
|
||||
Let's see if we can make a model reason about a simple arithmetic task with a basic prompt:
|
||||
|
||||
```python
|
||||
>>> torch.manual_seed(5) # doctest: +IGNORE_RESULT
|
||||
>>> prompt = """There are 5 groups of students in the class. Each group has 4 students. How many students are there in the class?"""
|
||||
|
||||
>>> sequences = pipe(
|
||||
... prompt,
|
||||
... max_new_tokens=30,
|
||||
... do_sample=True,
|
||||
... top_k=10,
|
||||
... return_full_text = False,
|
||||
... )
|
||||
|
||||
>>> for seq in sequences:
|
||||
... print(f"Result: {seq['generated_text']}")
|
||||
Result:
|
||||
There are a total of 5 groups, so there are 5 x 4=20 students in the class.
|
||||
```
|
||||
|
||||
Correct! Let's increase the complexity a little and see if we can still get away with a basic prompt:
|
||||
|
||||
```python
|
||||
>>> torch.manual_seed(6) # doctest: +IGNORE_RESULT
|
||||
>>> prompt = """I baked 15 muffins. I ate 2 muffins and gave 5 muffins to a neighbor. My partner then bought 6 more muffins and ate 2. How many muffins do we now have?"""
|
||||
|
||||
>>> sequences = pipe(
|
||||
... prompt,
|
||||
... max_new_tokens=10,
|
||||
... do_sample=True,
|
||||
... top_k=10,
|
||||
... return_full_text = False,
|
||||
... )
|
||||
|
||||
>>> for seq in sequences:
|
||||
... print(f"Result: {seq['generated_text']}")
|
||||
Result:
|
||||
The total number of muffins now is 21
|
||||
```
|
||||
|
||||
This is a wrong answer; it should be 12. In this case, this can be due to the prompt being too basic, or due to the choice
of model; after all, we've picked the smallest version of Falcon. Reasoning is difficult for models of all sizes, but larger
models are likely to perform better.
|
||||
|
||||
## Best practices of LLM prompting
|
||||
|
||||
In this section of the guide we have compiled a list of best practices that tend to improve the prompt results:
|
||||
|
||||
* When choosing the model to work with, the latest and most capable models are likely to perform better.
|
||||
* Start with a simple and short prompt, and iterate from there.
|
||||
* Put the instructions at the beginning of the prompt, or at the very end. When working with large context, models apply various optimizations to prevent Attention complexity from scaling quadratically. This may make a model more attentive to the beginning or end of a prompt than the middle.
|
||||
* Clearly separate instructions from the text they apply to - more on this in the next section.
|
||||
* Be specific and descriptive about the task and the desired outcome - its format, length, style, language, etc.
|
||||
* Avoid ambiguous descriptions and instructions.
|
||||
* Favor instructions that say "what to do" instead of those that say "what not to do".
|
||||
* "Lead" the output in the right direction by writing the first word (or even begin the first sentence for the model).
|
||||
* Use advanced techniques like [Few-shot prompting](#few-shot-prompting) and [Chain-of-thought](#chain-of-thought)
|
||||
* Test your prompts with different models to assess their robustness.
|
||||
* Version and track the performance of your prompts.
|
||||
|
||||
## Advanced prompting techniques
|
||||
|
||||
### Few-shot prompting
|
||||
|
||||
The basic prompts in the sections above are examples of "zero-shot" prompts, meaning the model has been given
instructions and context, but no examples with solutions. LLMs that have been fine-tuned on instruction datasets generally
perform well on such "zero-shot" tasks. However, you may find that your task has more complexity or nuance, and, perhaps,
you have some requirements for the output that the model doesn't catch on to just from the instructions. In this case, you can
try the technique called few-shot prompting.
|
||||
|
||||
In few-shot prompting, we provide examples in the prompt giving the model more context to improve the performance.
|
||||
The examples condition the model to generate the output following the patterns in the examples.
|
||||
|
||||
Here's an example:
|
||||
|
||||
```python
|
||||
>>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
|
||||
>>> prompt = """Text: The first human went into space and orbited the Earth on April 12, 1961.
|
||||
... Date: 04/12/1961
|
||||
... Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon.
|
||||
... Date:"""
|
||||
|
||||
>>> sequences = pipe(
|
||||
... prompt,
|
||||
... max_new_tokens=8,
|
||||
... do_sample=True,
|
||||
... top_k=10,
|
||||
... )
|
||||
|
||||
>>> for seq in sequences:
|
||||
... print(f"Result: {seq['generated_text']}")
|
||||
Result: Text: The first human went into space and orbited the Earth on April 12, 1961.
|
||||
Date: 04/12/1961
|
||||
Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon.
|
||||
Date: 09/28/1960
|
||||
```
|
||||
|
||||
In the above code snippet we used a single example to demonstrate the desired output to the model, so this can be called
"one-shot" prompting. However, depending on the task complexity you may need to use more than one example.
|
||||
|
||||
Limitations of the few-shot prompting technique:
|
||||
- While LLMs can pick up on the patterns in the examples, this technique doesn't work well on complex reasoning tasks
- Few-shot prompting requires creating lengthy prompts. Prompts with a large number of tokens can increase computation and latency. There's also a limit to the length of the prompts.
|
||||
- Sometimes when given a number of examples, models can learn patterns that you didn't intend them to learn, e.g. that the third movie review is always negative.
|
||||
|
||||
### Chain-of-thought
|
||||
|
||||
Chain-of-thought (CoT) prompting is a technique that nudges a model to produce intermediate reasoning steps thus improving
|
||||
the results on complex reasoning tasks.
|
||||
|
||||
There are two ways of steering a model to produce the reasoning steps:
|
||||
- few-shot prompting by illustrating examples with detailed answers to questions, showing the model how to work through a problem.
|
||||
- by instructing the model to reason by adding phrases like "Let's think step by step" or "Take a deep breath and work through the problem step by step."
|
||||
|
||||
If we apply the CoT technique to the muffins example from the [reasoning section](#reasoning) and use a larger model,
|
||||
such as `tiiuae/falcon-180B-chat` (which you can play with in [HuggingChat](https://huggingface.co/chat/)),
|
||||
we'll get a significant improvement on the reasoning result:
|
||||
|
||||
```text
|
||||
Let's go through this step-by-step:
|
||||
1. You start with 15 muffins.
|
||||
2. You eat 2 muffins, leaving you with 13 muffins.
|
||||
3. You give 5 muffins to your neighbor, leaving you with 8 muffins.
|
||||
4. Your partner buys 6 more muffins, bringing the total number of muffins to 14.
|
||||
5. Your partner eats 2 muffins, leaving you with 12 muffins.
|
||||
Therefore, you now have 12 muffins.
|
||||
```
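As a rough sketch of the second approach, you can append a phrase like "Let's think step by step" to the prompt and reuse the `pipe` from the setup above (falcon-7b-instruct is much smaller than the 180B chat model, so the reasoning may still go wrong):

```python
# A sketch of zero-shot chain-of-thought prompting with the pipeline from above.
# The exact output will vary from run to run and may still be wrong for a 7B model.
prompt = """I baked 15 muffins. I ate 2 muffins and gave 5 muffins to a neighbor. My partner then bought 6 more muffins and ate 2. How many muffins do we now have?
Let's think step by step."""

sequences = pipe(
    prompt,
    max_new_tokens=80,
    do_sample=True,
    top_k=10,
    return_full_text=False,
)

print(sequences[0]["generated_text"])
```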
|
||||
|
||||

## Prompting vs fine-tuning

You can achieve great results by optimizing your prompts; however, you may still wonder whether fine-tuning a model
would work better for your case. Here are some scenarios when fine-tuning a smaller model may be a preferred option:

- Your domain is wildly different from what LLMs were pre-trained on, and extensive prompt optimization did not yield sufficient results.
- You need your model to work well in a low-resource language.
- You need the model to be trained on sensitive data that is under strict regulations.
- You have to use a small model due to cost, privacy, infrastructure, or other limitations.

In all of the above examples, you will need to make sure that you either already have or can easily obtain a large enough
domain-specific dataset at a reasonable cost to fine-tune a model. You will also need to have enough time and resources
to fine-tune a model.

If the above examples are not the case for you, optimizing prompts can prove to be more beneficial.

@ -206,7 +206,7 @@ The transform is applied on the fly which is faster and consumes less disk space

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [mean Intersection over Union](https://huggingface.co/spaces/evaluate-metric/accuracy) (IoU) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):
Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [mean Intersection over Union](https://huggingface.co/spaces/evaluate-metric/accuracy) (IoU) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

```py
>>> import evaluate
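
>>> # A sketch of the step that follows: load the metric itself
>>> # (assuming the "mean_iou" id on the 🤗 Evaluate hub).
>>> metric = evaluate.load("mean_iou")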

@ -33,7 +33,7 @@ The task illustrated in this tutorial is supported by the following model archit

<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [CodeLlama](../model_doc/code_llama), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [CodeLlama](../model_doc/code_llama), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)

@ -35,7 +35,7 @@ The task illustrated in this tutorial is supported by the following model archit

<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SeamlessM4T](../model_doc/seamless_m4t), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)

<!--End of the generated tip-->

@ -32,7 +32,7 @@ The task illustrated in this tutorial is supported by the following model archit

<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SeamlessM4T](../model_doc/seamless_m4t), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)

<!--End of the generated tip-->

@ -57,8 +57,6 @@ RUN_SLOW=1 pytest examples/

### Choosing which tests to run

This document goes into many details of how tests can be run. If after reading everything, you need even more details

@ -184,6 +182,7 @@ pytest -k "test and ada" tests/test_optimization.py

### Run `accelerate` tests

Sometimes you need to run `accelerate` tests on your models. For that you can just add `-m accelerate_tests` to your command. For example, to run these tests on `OPT`, run:

```bash
RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py
```

@ -514,6 +513,7 @@ n_gpu = get_gpu_count() # works with torch and tf

### Testing with a specific PyTorch backend or device

To run the test suite on a specific torch device add `TRANSFORMERS_TEST_DEVICE="$device"` where `$device` is the target backend. For example, to test on CPU only:

```bash
TRANSFORMERS_TEST_DEVICE="cpu" pytest tests/utils/test_logging.py
```

@ -521,9 +521,29 @@ TRANSFORMERS_TEST_DEVICE="cpu" pytest tests/utils/test_logging.py

This variable is useful for testing custom or less common PyTorch backends such as `mps`. It can also be used to achieve the same effect as `CUDA_VISIBLE_DEVICES` by targeting specific GPUs or testing in CPU-only mode.

Certain devices will require an additional import after importing `torch` for the first time. This can be specified using the environment variable `TRANSFORMERS_TEST_BACKEND`:

```bash
TRANSFORMERS_TEST_BACKEND="torch_npu" pytest tests/utils/test_logging.py
```

Alternative backends may also require the replacement of device-specific functions. For example `torch.cuda.manual_seed` may need to be replaced with a device-specific seed setter like `torch.npu.manual_seed` to correctly set a random seed on the device. To specify a new backend with backend-specific device functions when running the test suite, create a Python device specification file in the format:

```
import torch
import torch_npu
# !! Further additional imports can be added here !!

# Specify the device name (e.g. 'cuda', 'cpu', 'npu')
DEVICE_NAME = 'npu'

# Specify device-specific backends to dispatch to.
# If not specified, will fallback to 'default' in `testing_utils.py`
MANUAL_SEED_FN = torch.npu.manual_seed
EMPTY_CACHE_FN = torch.npu.empty_cache
DEVICE_COUNT_FN = torch.npu.device_count
```
This format also allows for specification of any additional imports required. To use this file to replace equivalent methods in the test suite, set the environment variable `TRANSFORMERS_TEST_DEVICE_SPEC` to the path of the spec file.

Currently, only `MANUAL_SEED_FN`, `EMPTY_CACHE_FN` and `DEVICE_COUNT_FN` are supported for device-specific dispatch.
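
For instance, assuming the spec above is saved as `spec.py` (the filename here is just an illustration), the test run could point to it like this:

```bash
TRANSFORMERS_TEST_DEVICE_SPEC=spec.py pytest tests/utils/test_logging.py
```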

### Distributed training

@ -879,7 +899,8 @@ or the `xfail` way:

def test_feature_x():
```

- Here is how to skip a test based on some internal check inside the test:
Here's how to skip a test based on internal checks within the test:

```python
def test_feature_x():
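    # A sketch of the body that typically follows; `has_something()` stands in for
    # whatever internal condition the test needs to check and is not a real helper.
    if not has_something():
        pytest.skip("unsupported configuration")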

3  docs/source/hi/_toctree.yml  Normal file
@ -0,0 +1,3 @@
- sections:
  - local: pipeline_tutorial
    title: Run inference with pipelines

317  docs/source/hi/pipeline_tutorial.md  Normal file
@ -0,0 +1,317 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Pipelines for inference

The [`pipeline`] makes it easy to use any model from the [Hub](https://huggingface.co/models) for inference on any language, computer vision, speech, and multimodal tasks. Even if you don't have experience with a specific modality or aren't familiar with the underlying code behind the models, you can still use them for inference with the [`pipeline`]! This tutorial will teach you to:

* Use a [`pipeline`] for inference.
* Use a specific tokenizer or model.
* Use a [`pipeline`] for audio, vision, and multimodal tasks.

<Tip>

Take a look at the [`pipeline`] documentation for a complete list of supported tasks and available parameters.

</Tip>

## Pipeline usage

While each task has an associated [`pipeline`], it is simpler to use the general [`pipeline`] abstraction which contains
all the task-specific pipelines. The [`pipeline`] automatically loads a default model and a preprocessing class capable
of inference for your task. Let's take the example of using the [`pipeline`] for automatic speech recognition (ASR), or
speech-to-text.

1. Start by creating a [`pipeline`] and specify the inference task:

```py
>>> from transformers import pipeline

>>> transcriber = pipeline(task="automatic-speech-recognition")
```

2. Pass your input to the [`pipeline`]. In the case of speech recognition, this is an audio input file:

```py
>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'}
```

Not the result you had in mind? Check out some of the [most downloaded automatic speech recognition models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending)
on the Hub to see if you can get a better transcription.

Let's try the [Whisper large-v2](https://huggingface.co/openai/whisper-large) model from OpenAI. Whisper was released
2 years after Wav2Vec2, and was trained on close to 10x more data. As such, it beats Wav2Vec2 on most downstream
benchmarks. It also has the added benefit of predicting punctuation and casing, neither of which are possible with
Wav2Vec2.

Let's give it a try here to see how it performs:

```py
>>> transcriber = pipeline(model="openai/whisper-large-v2")
>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
```

Now this result looks more accurate! For a deep-dive comparison of Wav2Vec2 vs Whisper, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/asr_models).
We really encourage you to check out the Hub for models in different languages, models specialized in your field, and more.
You can check out and compare model results directly from your browser on the Hub to see if a model fits or
handles corner cases better than others.
And if you don't find a model for your use case, you can always start [training](training) your own!

If you have several inputs, you can pass them as a list:

```py
transcriber(
    [
        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac",
        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac",
    ]
)
```

Pipelines are great for experimentation as switching from one model to another is trivial; however, there are some ways to optimize them for larger workloads than experimentation. See the following guides from the docs that dive into iterating over whole datasets or using pipelines in a webserver:
* [Using pipelines on a dataset](#using-pipelines-on-a-dataset)
* [Using pipelines for a webserver](./pipeline_webserver)

## Parameters

[`pipeline`] supports many parameters; some are task specific, and some are general to all pipelines.
In general, you can specify parameters anywhere you want:

```py
transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1)

out = transcriber(...)  # This will use `my_parameter=1`.
out = transcriber(..., my_parameter=2)  # This will override and use `my_parameter=2`.
out = transcriber(...)  # This will go back to using `my_parameter=1`.
```

Let's check out 3 important ones:

### Device

If you use `device=0`, the pipeline automatically puts the model on the specified device.
This will work regardless of whether you are using PyTorch or Tensorflow.

```py
transcriber = pipeline(model="openai/whisper-large-v2", device=0)
```

If the model is too large for a single GPU and you are using PyTorch, you can set `device_map="auto"` to automatically
determine how to load and store the model weights. Using the `device_map` argument requires the 🤗 [Accelerate](https://huggingface.co/docs/accelerate)
package:

```bash
pip install --upgrade accelerate
```

The following code automatically loads and stores model weights across devices:

```py
transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")
```

Note that if `device_map="auto"` is passed, there is no need to add the argument `device=device` when instantiating your `pipeline` as you may encounter some unexpected behavior!

### Batch size

By default, pipelines will not batch inference for reasons explained in detail [here](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). The reason is that batching is not necessarily faster, and can actually be quite slower in some cases.

But if it works in your use case, you can use:

```py
transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2)
audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)]
texts = transcriber(audio_filenames)
```

This runs the pipeline on the 4 provided audio files, but it will pass them in batches of 2
to the model (which is on a GPU, where batching is more likely to help) without requiring any further code from you.
The output should always match what you would have received without batching. It is only meant as a way to help you get more speed out of a pipeline.

Pipelines can also alleviate some of the complexities of batching because, for some pipelines, a single item (like a long audio file) needs to be chunked into multiple parts to be processed by a model. The pipeline performs this [*chunk batching*](./main_classes/pipelines#pipeline-chunk-batching) for you.

### Task specific parameters

All tasks provide task specific parameters which allow for additional flexibility and options to help you get your job done.
For instance, the [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] method has a `return_timestamps` parameter which sounds promising for subtitling videos:

```py
>>> transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True)
>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.', 'chunks': [{'timestamp': (0.0, 11.88), 'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its'}, {'timestamp': (11.88, 12.38), 'text': ' creed.'}]}
```

As you can see, the model inferred the text and also outputted **when** the various sentences were pronounced.

There are many parameters available for each task, so check out each task's API reference to see what you can tinker with!
For instance, the [`~transformers.AutomaticSpeechRecognitionPipeline`] has a `chunk_length_s` parameter which is helpful
for working on really long audio files (for example, subtitling entire movies or hour-long videos) that a model typically
cannot handle on its own:

```python
>>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30, return_timestamps=True)
>>> transcriber("https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav")
{'text': " Chapter 16. I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I, too, agree to whatever Marguerite wished, Marguerite to be unable to live apart from me. It was the day after the evening...
```

If you can't find a parameter that would really help you out, feel free to [request it](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)!

## Using pipelines on a dataset

The pipeline can also run inference on a large dataset. The easiest way we recommend doing this is by using an iterator:

```py
def data():
    for i in range(1000):
        yield f"My example {i}"


pipe = pipeline(model="gpt2", device=0)
generated_characters = 0
for out in pipe(data()):
    generated_characters += len(out[0]["generated_text"])
```

The iterator `data()` yields each item, and the pipeline automatically
recognizes the input is iterable and will start fetching the data while
it continues to process it on the GPU (this uses a [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) under the hood).
This is important because you don't have to allocate memory for the whole dataset
and you can feed the GPU as fast as possible.

Since batching could speed things up, it may be useful to try tuning the `batch_size` parameter here.

The simplest way to iterate over a dataset is to just load one from 🤗 [Datasets](https://github.com/huggingface/datasets/):

```py
# KeyDataset is a util that will just output the item we're interested in.
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset

pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0)
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]")

for out in pipe(KeyDataset(dataset, "audio")):
    print(out)
```

## Using pipelines for a webserver

<Tip>
Creating an inference engine is a complex topic which deserves its own
page.
</Tip>

[Link](./pipeline_webserver)

## Vision pipeline

Using a [`pipeline`] for vision tasks is practically identical.

Specify your task and pass your image to the classifier. The image can be a link, a local path or a base64-encoded image. For example, what species of cat is shown below?

![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg)

```py
>>> from transformers import pipeline

>>> vision_classifier = pipeline(model="google/vit-base-patch16-224")
>>> preds = vision_classifier(
...     images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
... )
>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
>>> preds
[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}]
```

## Text pipeline

Using a [`pipeline`] for NLP tasks is practically identical.

```py
>>> from transformers import pipeline

>>> # This model is a `zero-shot-classification` model.
>>> # It will classify text, except you are free to choose any label you might imagine
>>> classifier = pipeline(model="facebook/bart-large-mnli")
>>> classifier(
...     "I have a problem with my iphone that needs to be resolved asap!!",
...     candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
... )
{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}
```

## Multimodal pipeline

The [`pipeline`] supports more than one modality. For example, a visual question answering (VQA) task combines text and image. Feel free to use any image link you like and a question you want to ask about the image. The image can be a URL or a local path to the image.

For example, if you use this [invoice image](https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png):

```py
>>> from transformers import pipeline

>>> vqa = pipeline(model="impira/layoutlm-document-qa")
>>> vqa(
...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
...     question="What is the invoice number?",
... )
[{'score': 0.42515, 'answer': 'us-001', 'start': 16, 'end': 16}]
```

<Tip>

To run the example above you need to have [`pytesseract`](https://pypi.org/project/pytesseract/) installed in addition to 🤗 Transformers:

```bash
sudo apt install -y tesseract-ocr
pip install pytesseract
```

</Tip>

## Using `pipeline` on large models with 🤗 `accelerate`:

You can easily run `pipeline` on large models using 🤗 `accelerate`! First make sure you have installed `accelerate` with `pip install accelerate`.

First load your model using `device_map="auto"`! We will use `facebook/opt-1.3b` for our example.

```py
# pip install accelerate
import torch
from transformers import pipeline

pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
```

You can also pass 8-bit loaded models if you install `bitsandbytes` and add the argument `load_in_8bit=True`:

```py
# pip install accelerate bitsandbytes
import torch
from transformers import pipeline

pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True})
output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
```

Note that you can replace the checkpoint with any Hugging Face model that supports large model loading, such as BLOOM!