mirror of
https://github.com/huggingface/transformers.git
synced 2025-11-05 04:34:37 +08:00
Compare commits
457 Commits
arijitx/wa
...
relative-p
| Author | SHA1 | Date | |
|---|---|---|---|
| da72c2cdde | |||
| 50415b84d6 | |||
| 7f14839f55 | |||
| 242cc6e265 | |||
| b76290f44c | |||
| d453ea6120 | |||
| 120649bf3a | |||
| 7ec9128e5a | |||
| edb672ac5e | |||
| bd43151af4 | |||
| 3960ce917f | |||
| 9068fa6c57 | |||
| 53496ac510 | |||
| 3b29c9fdb7 | |||
| df15703b42 | |||
| a72f1c9f5b | |||
| 1690094bdb | |||
| 4aabf9b52c | |||
| 457d4a3245 | |||
| 5483388631 | |||
| a1344dbfb9 | |||
| 73083581a4 | |||
| c1daf724ea | |||
| 66336dc183 | |||
| a5282ab4bc | |||
| 224bde91ca | |||
| 39e146146b | |||
| 13e875cc07 | |||
| b4eef63a1d | |||
| 5e428b71b4 | |||
| 3114df41f4 | |||
| c99ddcc441 | |||
| 35b16032cb | |||
| b88090914d | |||
| fd1e67033e | |||
| cdaed367b0 | |||
| 2bc305107a | |||
| 1d463303fe | |||
| 49becbaa55 | |||
| 6e93d94792 | |||
| af4a1ecad0 | |||
| e0b58fb5ba | |||
| df1ec6b122 | |||
| fba0b6a820 | |||
| da0bed5f4a | |||
| 75343de938 | |||
| c38f4e1f1c | |||
| 90ed9ae2d1 | |||
| c70dacde94 | |||
| 2351729f7d | |||
| 29080643eb | |||
| 9fc34235fa | |||
| e0be053e43 | |||
| 5323094a22 | |||
| ca2a55e9df | |||
| dfc76b2542 | |||
| 66e8656778 | |||
| e9d5138768 | |||
| e160a5dd62 | |||
| 7d0b6fc340 | |||
| ee82c86bdc | |||
| 34097b3304 | |||
| ae7bae8fe7 | |||
| 264128cb9d | |||
| 9d99489f2f | |||
| 78c695eb62 | |||
| c6cea5a78c | |||
| 119e3c0fc8 | |||
| 706bb8364d | |||
| 5c8f601007 | |||
| 3cab90279f | |||
| 9e72eb4416 | |||
| b118730745 | |||
| b6a65ae52a | |||
| 9aa230aa2f | |||
| ad71965246 | |||
| 19a8a3036d | |||
| d28b7aa8cb | |||
| 34a886fce3 | |||
| 2e37ef35d1 | |||
| f6ad0e0556 | |||
| 4aed1dc81b | |||
| da71df1afc | |||
| 26e5e129b4 | |||
| 72f5b94984 | |||
| c4e58cd8ba | |||
| 254d9c068e | |||
| 8343901263 | |||
| 1c57242d7b | |||
| babeff5524 | |||
| 5c17918fe4 | |||
| 607acd4fbd | |||
| 1c220ced8e | |||
| 013462c57b | |||
| 2f59ad1609 | |||
| 046c5ea906 | |||
| 085321c9a1 | |||
| 048dd73bba | |||
| 588d8f1f26 | |||
| f128ccb997 | |||
| 216499bfcc | |||
| 659b27fd26 | |||
| 0932adb3e8 | |||
| 58fb3c9f98 | |||
| ca1f1c8685 | |||
| 3766df4fe1 | |||
| 028d4b7c8b | |||
| 84aaadd8c5 | |||
| 1d2b57b8a2 | |||
| 693720e567 | |||
| 4d1ce39683 | |||
| 4390151ba2 | |||
| 6813439fdc | |||
| 3042ea4f6f | |||
| bdc01711d6 | |||
| b1160c0b56 | |||
| d91da4c6df | |||
| 24092b1464 | |||
| 811da2b8c2 | |||
| 4f38808e9e | |||
| ba286fe7d5 | |||
| 7822a9b7a7 | |||
| f394a2a50d | |||
| 6ee1474b67 | |||
| 52e7c92920 | |||
| dfc38463b8 | |||
| 8f8b3cbce4 | |||
| 400b30936a | |||
| 5af38953bb | |||
| 567d9c061d | |||
| 975dd2bbbc | |||
| c1a138613d | |||
| b0e0ac8a67 | |||
| 2ef09ecfb8 | |||
| 28d0048218 | |||
| 04681c1d81 | |||
| 13fd67346a | |||
| d156898f3b | |||
| 7999ec125f | |||
| 98f6e1ee87 | |||
| 7535d92e71 | |||
| 2295bcaea8 | |||
| 8f46ac9849 | |||
| 5e7f085fcc | |||
| 70484a8d74 | |||
| a9eca74372 | |||
| 740a1574f1 | |||
| 284fc6c0bb | |||
| 35e2d13f3c | |||
| 897a8dd89f | |||
| 31484afbed | |||
| 56b35ce3eb | |||
| bd908e9bb1 | |||
| 4d727bd2df | |||
| 1ef9a1ed4a | |||
| 71e602725b | |||
| 374a2f693f | |||
| d980929803 | |||
| 31ee80d556 | |||
| 13541b4aa2 | |||
| 71cced8ae3 | |||
| 56f50590d5 | |||
| 2e7e4280aa | |||
| 1cd01b0af3 | |||
| c86aad6110 | |||
| 7b8cb26953 | |||
| b48ac1a094 | |||
| b9bb417324 | |||
| 3fd7de49f4 | |||
| 54192058f3 | |||
| 48c22691e3 | |||
| 5d6feecf16 | |||
| 518bd02c9b | |||
| e8714c0307 | |||
| 2b282296f1 | |||
| a4386d7e40 | |||
| 3601aa8fc9 | |||
| 1b20c970a2 | |||
| 6aad3872ce | |||
| 1762ded30a | |||
| 6e195eb9de | |||
| 060fe61dff | |||
| b3b9f99ed2 | |||
| 6da76b9c2a | |||
| adc0ff2502 | |||
| 4710702837 | |||
| 5fdb54ece7 | |||
| 91ede485a7 | |||
| fe28eb9452 | |||
| 2cb2ea3fa1 | |||
| 1c9d1f4ca8 | |||
| 60ad73448c | |||
| 7ba1d4e51f | |||
| d6b8e9cec7 | |||
| c35264007b | |||
| d9050dc768 | |||
| bad358398a | |||
| 0511305549 | |||
| 032d63b976 | |||
| 986dd5c5bf | |||
| 38ddab10da | |||
| 10704e1209 | |||
| 28a0811652 | |||
| 1f13ba818e | |||
| 349f1c85d3 | |||
| 651e48e1e5 | |||
| 6d211429ec | |||
| ec7f8af106 | |||
| a26ab95e30 | |||
| 1ac2b8fa7f | |||
| 5a9957358c | |||
| f0395cf58e | |||
| e705e1267c | |||
| f6a6388972 | |||
| 6cb7187324 | |||
| 053a80c606 | |||
| 8600d770d4 | |||
| 3fb82f74fd | |||
| 9b0d2860eb | |||
| 66b3e106a1 | |||
| ddb1a47ec8 | |||
| 2f611f85e2 | |||
| 95b6bef624 | |||
| a5d1839679 | |||
| 05a90579a8 | |||
| e730e12567 | |||
| 518dd1277e | |||
| 71d18d0831 | |||
| 71abd3ade1 | |||
| d3d87b451e | |||
| e86faecfd4 | |||
| ee393c009a | |||
| 16be422912 | |||
| f9024814e1 | |||
| 50d1867cf8 | |||
| 506899d147 | |||
| 7198b63362 | |||
| b96cb1693f | |||
| 9c8fde8e19 | |||
| 993553b2f1 | |||
| 38043d8453 | |||
| 18d6b356c5 | |||
| dfc76018c1 | |||
| 85fc455972 | |||
| 3f936df662 | |||
| afe5d42d8d | |||
| 9bd67ac7bb | |||
| 30be0da5da | |||
| f04257fdbc | |||
| 5294fa12ee | |||
| 9f16a1cc13 | |||
| a42242da7c | |||
| b971c769e8 | |||
| 8c7481f35c | |||
| 1a688709b3 | |||
| b17b78897b | |||
| 1a66a6c677 | |||
| e2d678b71c | |||
| ae82da2181 | |||
| 36ddcc0d35 | |||
| d1d5ebb16c | |||
| a10f61834d | |||
| 7b95825d7d | |||
| 934e21cd4b | |||
| 47412c7d43 | |||
| c76afa511c | |||
| edcc66d27c | |||
| c33f6046c3 | |||
| 5229744b26 | |||
| 6bc6797e04 | |||
| 0a2bea4752 | |||
| 0645b07daf | |||
| 0e6ec2a469 | |||
| 63517fdf48 | |||
| 4a419d4995 | |||
| 48a8f3daa1 | |||
| 4ad2f68e34 | |||
| e99f0efedc | |||
| 976835d515 | |||
| 259eeb6dab | |||
| f861504466 | |||
| 9aeacfe0ff | |||
| 1766fa2159 | |||
| 6d80c92c77 | |||
| d719bcd46a | |||
| 766d4bf792 | |||
| 2fbb237967 | |||
| df735d1317 | |||
| 7783fa6bb3 | |||
| 05fc1766ff | |||
| dc3645dc9c | |||
| a021f2b90c | |||
| e9fd583ce0 | |||
| 3212afa614 | |||
| 215e0681e4 | |||
| 351cdbdfdc | |||
| cad61b6839 | |||
| a59eb349c5 | |||
| dd16a113a4 | |||
| c849a61e65 | |||
| 99289c08a1 | |||
| 45360e1a8e | |||
| db377a0b37 | |||
| 6dc4c36acb | |||
| 23619ef6b7 | |||
| 870e6f29a6 | |||
| 279bc5849b | |||
| ef20390291 | |||
| bb8d40529e | |||
| 9c5ae87f13 | |||
| 2bf95e2b09 | |||
| 7a229ef446 | |||
| 049e791758 | |||
| d76d2a2af7 | |||
| 675e2d1663 | |||
| 4bb1d0ec84 | |||
| db034660fb | |||
| 39f8eafc1b | |||
| dd739f7045 | |||
| 1c9fcd0e04 | |||
| 6e17ba6aa5 | |||
| a8fa2f91f4 | |||
| 19420fd99e | |||
| cd9274d010 | |||
| 31616b8d61 | |||
| 1073f00d4e | |||
| 30ca529902 | |||
| bb2e088be7 | |||
| 1ac698744c | |||
| f275e593bf | |||
| 35d48db881 | |||
| daecae1f1c | |||
| 2de2c9ecca | |||
| 4be8b95a9f | |||
| bdd690a74d | |||
| 9586e222af | |||
| 93b802c43e | |||
| 1ae182d9a6 | |||
| 2c2a2169b6 | |||
| ff846e9b28 | |||
| eb877f1fd0 | |||
| da47c264f9 | |||
| ede5e04191 | |||
| 7152ed2bae | |||
| 18df440709 | |||
| b8dffd1f3e | |||
| 4f3a14e3c2 | |||
| 20fb5d51ea | |||
| 63fbed5c59 | |||
| fb0ae12947 | |||
| 57e6464ac9 | |||
| e952e049b4 | |||
| e6f00a11d7 | |||
| 3486a92a57 | |||
| 5af5735f62 | |||
| 01562dac7e | |||
| 1be8d56ec6 | |||
| 1f9e862507 | |||
| dced262409 | |||
| 992996e9ca | |||
| 596afb4297 | |||
| 691cdbb7d7 | |||
| 60e1d883f1 | |||
| c79bbc3ba5 | |||
| bfbec17765 | |||
| cf8a7c2490 | |||
| 5896b3ecce | |||
| 084c38c59d | |||
| c82e017aa9 | |||
| 49d5bcb0f3 | |||
| 479fdc4925 | |||
| a4a88fa09f | |||
| 2d91e3c304 | |||
| d365f5074f | |||
| 10dfa126b7 | |||
| aaee4038c3 | |||
| 8afaaa26f5 | |||
| fa32247406 | |||
| 344b9fb0c6 | |||
| 6568752039 | |||
| fea94d6790 | |||
| e03966e404 | |||
| 8246caf3eb | |||
| 9331b37967 | |||
| 809dac48f9 | |||
| f6210c49e2 | |||
| 32adbb26d6 | |||
| 3e47d19cfc | |||
| 3a71e94a92 | |||
| 508baf1943 | |||
| 72728be3db | |||
| 22fc93c4d9 | |||
| 99c8226b12 | |||
| ec81c11a18 | |||
| 0d1cff1195 | |||
| 9fa88172c2 | |||
| 6d90d76f5d | |||
| 3b1bbefc47 | |||
| d91841315a | |||
| eef2422e96 | |||
| 0b1e0fcf7a | |||
| bae9b6458c | |||
| cb555af2c7 | |||
| e789418ebe | |||
| daf520b033 | |||
| cb7e166428 | |||
| 6620f60c0a | |||
| 705d65368f | |||
| 175da8d182 | |||
| 67ed0e43dc | |||
| afa1ef0992 | |||
| e13a91fe60 | |||
| ff06b17791 | |||
| e1c153cbaa | |||
| 3104036e7f | |||
| e6d23a4b9b | |||
| 3dd57b15c5 | |||
| 1efca4e6c8 | |||
| b5c6a63ed9 | |||
| f09c45e067 | |||
| 74814574ae | |||
| b96e82c80a | |||
| db9f189121 | |||
| 77de8d6c31 | |||
| b74a955325 | |||
| 3663fca41b | |||
| a2392415e9 | |||
| d3bd9ac728 | |||
| 6de4ee61a0 | |||
| 306c9ee966 | |||
| 7db7aab439 | |||
| 494c2a8c4d | |||
| 989a15d173 | |||
| c11a49573f | |||
| 51e0ebedcb | |||
| 51fa7191b1 | |||
| 8d3f952adb | |||
| 33cd4be576 | |||
| 9a2995ee39 | |||
| d3c9d0e55f | |||
| 3785f4665a | |||
| 6984848ed0 | |||
| 438144832e | |||
| b4ddd2677c | |||
| 02de7a8e7f | |||
| dee6f01636 | |||
| 78f346c2b5 | |||
| ee209d4d01 | |||
| 5da33f8729 | |||
| ce2fef2ad2 | |||
| 1b7de41a07 | |||
| de8b06f9bf | |||
| 048443db86 | |||
| 3e4eec47f5 | |||
| c21e1071a7 | |||
| 89293a0f6b | |||
| b151ddb9b9 |
@ -78,12 +78,14 @@ jobs:
|
||||
keys:
|
||||
- v0.4-torch_and_tf-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
|
||||
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs
|
||||
- run: git lfs install
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
|
||||
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html
|
||||
- run: pip install tensorflow_probability
|
||||
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
- run: pip install git+https://github.com/huggingface/accelerate
|
||||
- save_cache:
|
||||
key: v0.4-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -93,7 +95,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf $(cat test_list.txt) -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf $(cat test_list.txt) -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -116,18 +118,20 @@ jobs:
|
||||
keys:
|
||||
- v0.4-torch_and_tf-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
|
||||
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng git-lfs
|
||||
- run: git lfs install
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
|
||||
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html
|
||||
- run: pip install tensorflow_probability
|
||||
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
- run: pip install git+https://github.com/huggingface/accelerate
|
||||
- save_cache:
|
||||
key: v0.4-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf tests -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf tests -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
- store_artifacts:
|
||||
@ -136,7 +140,7 @@ jobs:
|
||||
run_tests_torch_and_flax:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.6
|
||||
- image: circleci/python:3.7
|
||||
environment:
|
||||
OMP_NUM_THREADS: 1
|
||||
RUN_PT_FLAX_CROSS_TESTS: yes
|
||||
@ -154,6 +158,7 @@ jobs:
|
||||
- run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
|
||||
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html
|
||||
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
- run: pip install git+https://github.com/huggingface/accelerate
|
||||
- save_cache:
|
||||
key: v0.4-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -163,7 +168,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax $(cat test_list.txt) -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax $(cat test_list.txt) -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -173,7 +178,7 @@ jobs:
|
||||
run_tests_torch_and_flax_all:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.6
|
||||
- image: circleci/python:3.7
|
||||
environment:
|
||||
OMP_NUM_THREADS: 1
|
||||
RUN_PT_FLAX_CROSS_TESTS: yes
|
||||
@ -191,12 +196,13 @@ jobs:
|
||||
- run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
|
||||
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html
|
||||
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
- run: pip install git+https://github.com/huggingface/accelerate
|
||||
- save_cache:
|
||||
key: v0.4-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax tests -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax tests -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
- store_artifacts:
|
||||
@ -217,11 +223,12 @@ jobs:
|
||||
keys:
|
||||
- v0.4-torch-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
|
||||
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng time
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
|
||||
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html
|
||||
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
- run: pip install git+https://github.com/huggingface/accelerate
|
||||
- save_cache:
|
||||
key: v0.4-torch-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -231,7 +238,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 3 --dist=loadfile -s --make-reports=tests_torch $(cat test_list.txt) | tee tests_output.txt
|
||||
python -m pytest -n 3 --max-worker-restart=0 --dist=loadfile -s --make-reports=tests_torch $(cat test_list.txt) | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -258,12 +265,13 @@ jobs:
|
||||
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
|
||||
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html
|
||||
- run: pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
- run: pip install git+https://github.com/huggingface/accelerate
|
||||
- save_cache:
|
||||
key: v0.4-torch-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
python -m pytest -n 3 --dist=loadfile -s --make-reports=tests_torch tests | tee tests_output.txt
|
||||
python -m pytest -n 3 --max-worker-restart=0 --dist=loadfile -s --make-reports=tests_torch tests | tee tests_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
- store_artifacts:
|
||||
@ -298,7 +306,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf $(cat test_list.txt) | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_tf $(cat test_list.txt) | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -330,7 +338,7 @@ jobs:
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf tests | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_tf tests | tee tests_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
- store_artifacts:
|
||||
@ -364,7 +372,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax $(cat test_list.txt) | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_flax $(cat test_list.txt) | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -395,7 +403,7 @@ jobs:
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax tests | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_flax tests | tee tests_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
- store_artifacts:
|
||||
@ -431,7 +439,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test $(cat test_list.txt) | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test $(cat test_list.txt) | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -464,7 +472,7 @@ jobs:
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test tests | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test tests | tee tests_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
- store_artifacts:
|
||||
@ -498,7 +506,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf $(cat test_list.txt) -m is_pipeline_test | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf $(cat test_list.txt) -m is_pipeline_test | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -529,7 +537,7 @@ jobs:
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf tests -m is_pipeline_test | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf tests -m is_pipeline_test | tee tests_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
- store_artifacts:
|
||||
@ -549,7 +557,7 @@ jobs:
|
||||
- v0.4-custom_tokenizers-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy]
|
||||
- run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]
|
||||
- run: python -m unidic download
|
||||
- save_cache:
|
||||
key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
|
||||
@ -557,11 +565,11 @@ jobs:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt
|
||||
python -m pytest --max-worker-restart=0 -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt
|
||||
fi
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 1 tests/test_tokenization_clip.py --dist=loadfile -s --make-reports=tests_tokenization_clip --durations=100 | tee tests_output.txt
|
||||
python -m pytest -n 1 --max-worker-restart=0 tests/test_tokenization_clip.py --dist=loadfile -s --make-reports=tests_tokenization_clip --durations=100 | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -571,7 +579,7 @@ jobs:
|
||||
run_examples_torch:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.6
|
||||
- image: circleci/python:3.7
|
||||
environment:
|
||||
OMP_NUM_THREADS: 1
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
@ -587,7 +595,6 @@ jobs:
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
|
||||
- run: pip install -r examples/pytorch/_tests_requirements.txt
|
||||
- run: pip install git+https://github.com/huggingface/accelerate
|
||||
- save_cache:
|
||||
key: v0.4-torch_examples-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -597,7 +604,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/examples_output.txt
|
||||
@ -607,7 +614,7 @@ jobs:
|
||||
run_examples_torch_all:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.6
|
||||
- image: circleci/python:3.7
|
||||
environment:
|
||||
OMP_NUM_THREADS: 1
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
@ -628,7 +635,7 @@ jobs:
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
|
||||
TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/examples_output.txt
|
||||
- store_artifacts:
|
||||
@ -661,7 +668,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_flax ./examples/flax/ | tee tests_output.txt
|
||||
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_flax ./examples/flax/ | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/flax_examples_output.txt
|
||||
@ -691,7 +698,7 @@ jobs:
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_flax ./examples/flax/ | tee examples_output.txt
|
||||
TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_flax ./examples/flax/ | tee examples_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/flax_examples_output.txt
|
||||
- store_artifacts:
|
||||
@ -728,7 +735,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -sv --make-reports=tests_hub $(cat test_list.txt) -m is_staging_test | tee tests_output.txt
|
||||
python -m pytest --max-worker-restart=0 -sv --make-reports=tests_hub $(cat test_list.txt) -m is_staging_test | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -762,7 +769,7 @@ jobs:
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
python -m pytest -sv --make-reports=tests_hub tests -m is_staging_test | tee tests_output.txt
|
||||
python -m pytest --max-worker-restart=0 -sv --make-reports=tests_hub tests -m is_staging_test | tee tests_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
- store_artifacts:
|
||||
@ -784,7 +791,7 @@ jobs:
|
||||
- v0.4-torch-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[torch,testing,sentencepiece,onnxruntime,vision]
|
||||
- run: pip install .[torch,testing,sentencepiece,onnxruntime,vision,rjieba]
|
||||
- save_cache:
|
||||
key: v0.4-onnx-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -794,7 +801,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_onnx $(cat test_list.txt) -k onnx | tee tests_output.txt
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s --make-reports=tests_onnx $(cat test_list.txt) -k onnx | tee tests_output.txt
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -823,7 +830,7 @@ jobs:
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: |
|
||||
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_onnx tests -k onnx | tee tests_output.txt
|
||||
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s --make-reports=tests_onnx tests -k onnx | tee tests_output.txt
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
- store_artifacts:
|
||||
@ -832,7 +839,7 @@ jobs:
|
||||
check_code_quality:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.6
|
||||
- image: circleci/python:3.7
|
||||
resource_class: large
|
||||
environment:
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
@ -849,16 +856,17 @@ jobs:
|
||||
key: v0.4-code_quality-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
- '~/.cache/pip'
|
||||
- run: black --check examples tests src utils
|
||||
- run: black --check --preview examples tests src utils
|
||||
- run: isort --check-only examples tests src utils
|
||||
- run: python utils/custom_init_isort.py --check_only
|
||||
- run: python utils/sort_auto_mappings.py --check_only
|
||||
- run: flake8 examples tests src utils
|
||||
- run: doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
|
||||
|
||||
check_repository_consistency:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.6
|
||||
- image: circleci/python:3.7
|
||||
resource_class: large
|
||||
environment:
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
@ -880,10 +888,11 @@ jobs:
|
||||
- run: python utils/check_dummies.py
|
||||
- run: python utils/check_repo.py
|
||||
- run: python utils/check_inits.py
|
||||
- run: python utils/check_config_docstrings.py
|
||||
- run: make deps_table_check_updated
|
||||
- run: python utils/tests_fetcher.py --sanity_check
|
||||
|
||||
run_tests_layoutlmv2:
|
||||
run_tests_layoutlmv2_and_v3:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.7
|
||||
@ -914,7 +923,7 @@ jobs:
|
||||
path: ~/transformers/test_preparation.txt
|
||||
- run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 1 tests/*layoutlmv2* --dist=loadfile -s --make-reports=tests_layoutlmv2 --durations=100
|
||||
python -m pytest -n 1 --max-worker-restart=0 tests/models/*layoutlmv* --dist=loadfile -s --make-reports=tests_layoutlmv2_and_v3 --durations=100
|
||||
fi
|
||||
- store_artifacts:
|
||||
path: ~/transformers/tests_output.txt
|
||||
@ -924,7 +933,7 @@ jobs:
|
||||
# TPU JOBS
|
||||
run_examples_tpu:
|
||||
docker:
|
||||
- image: circleci/python:3.6
|
||||
- image: circleci/python:3.7
|
||||
environment:
|
||||
OMP_NUM_THREADS: 1
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
@ -944,7 +953,7 @@ jobs:
|
||||
|
||||
cleanup-gke-jobs:
|
||||
docker:
|
||||
- image: circleci/python:3.6
|
||||
- image: circleci/python:3.7
|
||||
steps:
|
||||
- gcp-gke/install
|
||||
- gcp-gke/update-kubeconfig-with-credentials:
|
||||
@ -975,7 +984,7 @@ workflows:
|
||||
- run_tests_pipelines_tf
|
||||
- run_tests_onnxruntime
|
||||
- run_tests_hub
|
||||
- run_tests_layoutlmv2
|
||||
- run_tests_layoutlmv2_and_v3
|
||||
nightly:
|
||||
triggers:
|
||||
- schedule:
|
||||
|
||||
3
.gitattributes
vendored
3
.gitattributes
vendored
@ -1,3 +1,4 @@
|
||||
*.py eol=lf
|
||||
*.rst eol=lf
|
||||
*.md eol=lf
|
||||
*.md eol=lf
|
||||
*.mdx eol=lf
|
||||
22
.github/ISSUE_TEMPLATE/---new-benchmark.md
vendored
22
.github/ISSUE_TEMPLATE/---new-benchmark.md
vendored
@ -1,22 +0,0 @@
|
||||
---
|
||||
name: "\U0001F5A5 New benchmark"
|
||||
about: Benchmark a part of this library and share your results
|
||||
title: "[Benchmark]"
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# 🖥 Benchmarking `transformers`
|
||||
|
||||
## Benchmark
|
||||
|
||||
Which part of `transformers` did you benchmark?
|
||||
|
||||
## Set-up
|
||||
|
||||
What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use?
|
||||
|
||||
## Results
|
||||
|
||||
Put your results here!
|
||||
20
.github/ISSUE_TEMPLATE/--new-model-addition.md
vendored
20
.github/ISSUE_TEMPLATE/--new-model-addition.md
vendored
@ -1,20 +0,0 @@
|
||||
---
|
||||
name: "\U0001F31F New model addition"
|
||||
about: Submit a proposal/request to implement a new Transformer-based model
|
||||
title: ''
|
||||
labels: New model
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# 🌟 New model addition
|
||||
|
||||
## Model description
|
||||
|
||||
<!-- Important information -->
|
||||
|
||||
## Open source status
|
||||
|
||||
* [ ] the model implementation is available: (give details)
|
||||
* [ ] the model weights are available: (give details)
|
||||
* [ ] who are the authors: (mention them, if possible by @gh-username)
|
||||
107
.github/ISSUE_TEMPLATE/bug-report.md
vendored
107
.github/ISSUE_TEMPLATE/bug-report.md
vendored
@ -1,107 +0,0 @@
|
||||
---
|
||||
name: "\U0001F41B Bug Report"
|
||||
about: Submit a bug report to help us improve transformers
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
|
||||
## Environment info
|
||||
<!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
|
||||
Don't forget to fill out the missing fields in that output! -->
|
||||
|
||||
- `transformers` version:
|
||||
- Platform:
|
||||
- Python version:
|
||||
- PyTorch version (GPU?):
|
||||
- Tensorflow version (GPU?):
|
||||
- Using GPU in script?:
|
||||
- Using distributed or parallel set-up in script?:
|
||||
|
||||
### Who can help
|
||||
<!-- Your issue will be replied to more quickly if you can figure out the right person to tag with @
|
||||
If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
|
||||
Please tag fewer than 3 people.
|
||||
|
||||
Models:
|
||||
|
||||
- ALBERT, BERT, XLM, DeBERTa, DeBERTa-v2, ELECTRA, MobileBert, SqueezeBert: @LysandreJik
|
||||
- T5, Pegasus, EncoderDecoder: @patrickvonplaten
|
||||
- Blenderbot, MBART, BART, Marian, Pegasus: @patil-suraj
|
||||
- Reformer, TransfoXL, XLNet, FNet: @patrickvonplaten
|
||||
- Longformer, BigBird: @ydshieh
|
||||
- FSMT: @stas00
|
||||
- Funnel: @sgugger
|
||||
- GPT-2, GPT: @patil-suraj, @patrickvonplaten, @LysandreJik
|
||||
- RAG, DPR: @patrickvonplaten, @lhoestq
|
||||
- TensorFlow: @Rocketknight1
|
||||
- JAX/Flax: @patil-suraj
|
||||
- TAPAS, LayoutLM, LayoutLMv2, LUKE, ViT, BEiT, DEiT, DETR, CANINE: @NielsRogge
|
||||
- GPT-Neo, GPT-J, CLIP: @patil-suraj
|
||||
- Wav2Vec2, HuBERT, SpeechEncoderDecoder, UniSpeech, UniSpeechSAT, SEW, SEW-D, Speech2Text: @patrickvonplaten, @anton-l
|
||||
|
||||
If the model isn't in the list, ping @LysandreJik who will redirect you to the correct contributor.
|
||||
|
||||
Library:
|
||||
|
||||
- Benchmarks: @patrickvonplaten
|
||||
- Deepspeed: @stas00
|
||||
- Ray/raytune: @richardliaw, @amogkam
|
||||
- Text generation: @patrickvonplaten @narsil
|
||||
- Tokenizers: @SaulLu
|
||||
- Trainer: @sgugger
|
||||
- Pipelines: @Narsil
|
||||
- Speech: @patrickvonplaten, @anton-l
|
||||
- Vision: @NielsRogge, @sgugger
|
||||
|
||||
Documentation: @sgugger
|
||||
|
||||
Model hub:
|
||||
|
||||
- for issues with a model, report at https://discuss.huggingface.co/ and tag the model's creator.
|
||||
|
||||
HF projects:
|
||||
|
||||
- datasets: [different repo](https://github.com/huggingface/datasets)
|
||||
- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
|
||||
|
||||
Examples:
|
||||
|
||||
- maintained examples (not research project or legacy): @sgugger, @patil-suraj
|
||||
|
||||
For research projetcs, please ping the contributor directly. For example, on the following projects:
|
||||
|
||||
- research_projects/bert-loses-patience: @JetRunner
|
||||
- research_projects/distillation: @VictorSanh
|
||||
|
||||
-->
|
||||
|
||||
## Information
|
||||
|
||||
Model I am using (Bert, XLNet ...):
|
||||
|
||||
The problem arises when using:
|
||||
* [ ] the official example scripts: (give details below)
|
||||
* [ ] my own modified scripts: (give details below)
|
||||
|
||||
The tasks I am working on is:
|
||||
* [ ] an official GLUE/SQUaD task: (give the name)
|
||||
* [ ] my own task or dataset: (give details below)
|
||||
|
||||
## To reproduce
|
||||
|
||||
Steps to reproduce the behavior:
|
||||
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
|
||||
<!-- If you have code snippets, error messages, stack traces please provide them here as well.
|
||||
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
|
||||
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.-->
|
||||
|
||||
## Expected behavior
|
||||
|
||||
<!-- A clear and concise description of what you would expect to happen. -->
|
||||
121
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
Normal file
121
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
Normal file
@ -0,0 +1,121 @@
|
||||
name: "\U0001F41B Bug Report"
|
||||
description: Submit a bug report to help us improve transformers
|
||||
labels: [ "bug" ]
|
||||
body:
|
||||
- type: textarea
|
||||
id: system-info
|
||||
attributes:
|
||||
label: System Info
|
||||
description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
|
||||
render: shell
|
||||
placeholder: transformers version, platform, python version, ...
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: who-can-help
|
||||
attributes:
|
||||
label: Who can help?
|
||||
description: |
|
||||
Your issue will be replied to more quickly if you can figure out the right person to tag with @
|
||||
If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
|
||||
Please tag fewer than 3 people.
|
||||
|
||||
Models:
|
||||
|
||||
- ALBERT, BERT, XLM, DeBERTa, DeBERTa-v2, ELECTRA, MobileBert, SqueezeBert: `@LysandreJik`
|
||||
- T5, Pegasus, EncoderDecoder: `@patrickvonplaten`
|
||||
- Blenderbot, MBART, BART, Marian, Pegasus: `@patil-suraj`
|
||||
- Reformer, TransfoXL, XLNet, FNet: `@patrickvonplaten`
|
||||
- Longformer, BigBird: `@ydshieh`
|
||||
- FSMT: `@stas00`
|
||||
- Funnel: `@sgugger`
|
||||
- GPT-2, GPT: `@patil-suraj`, `@patrickvonplaten`, `@LysandreJik`
|
||||
- RAG, DPR: `@patrickvonplaten`, `@lhoestq`
|
||||
- TensorFlow: `@Rocketknight1`
|
||||
- JAX/Flax: `@patil-suraj`
|
||||
- TAPAS, LayoutLM, LayoutLMv2, LUKE, ViT, BEiT, DEiT, DETR, CANINE: `@NielsRogge`
|
||||
- GPT-Neo, GPT-J, CLIP: `@patil-suraj`
|
||||
- Wav2Vec2, HuBERT, UniSpeech, UniSpeechSAT, SEW, SEW-D: `@patrickvonplaten`, `@anton-l`
|
||||
- SpeechEncoderDecoder, Speech2Text, Speech2Text2: `@sanchit-gandhi`, `@patrickvonplaten`, `@anton-l`
|
||||
|
||||
If the model isn't in the list, ping `@LysandreJik` who will redirect you to the correct contributor.
|
||||
|
||||
Library:
|
||||
- Benchmarks: `@patrickvonplaten`
|
||||
- Deepspeed: `@stas00`
|
||||
- Ray/raytune: `@richardliaw`, `@amogkam`
|
||||
- Text generation: `@patrickvonplaten`, `@Narsil`, `@gante`
|
||||
- Tokenizers: `@SaulLu`
|
||||
- Trainer: `@sgugger`
|
||||
- Pipelines: `@Narsil`
|
||||
- Speech: `@patrickvonplaten`, `@anton-l`, `@sanchit-gandhi`
|
||||
- Vision: `@NielsRogge`, `@sgugger`
|
||||
|
||||
Documentation: `@sgugger`, `@stevhliu`
|
||||
|
||||
Model hub:
|
||||
|
||||
- for issues with a model, report at https://discuss.huggingface.co/ and tag the model's creator.
|
||||
|
||||
HF projects:
|
||||
|
||||
- datasets: [different repo](https://github.com/huggingface/datasets)
|
||||
- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
|
||||
|
||||
Examples:
|
||||
|
||||
- maintained examples (not research project or legacy): `@sgugger`, `@patil-suraj`
|
||||
|
||||
For research projetcs, please ping the contributor directly. For example, on the following projects:
|
||||
|
||||
- research_projects/bert-loses-patience: `@JetRunner`
|
||||
- research_projects/distillation: `@VictorSanh`
|
||||
placeholder: "@Username ..."
|
||||
|
||||
- type: checkboxes
|
||||
id: information-scripts-examples
|
||||
attributes:
|
||||
label: Information
|
||||
description: 'The problem arises when using:'
|
||||
options:
|
||||
- label: "The official example scripts"
|
||||
- label: "My own modified scripts"
|
||||
|
||||
- type: checkboxes
|
||||
id: information-tasks
|
||||
attributes:
|
||||
label: Tasks
|
||||
description: "The tasks I am working on are:"
|
||||
options:
|
||||
- label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)"
|
||||
- label: "My own task or dataset (give details below)"
|
||||
|
||||
- type: textarea
|
||||
id: reproduction
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Reproduction
|
||||
description: |
|
||||
Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
|
||||
If you have code snippets, error messages, stack traces please provide them here as well.
|
||||
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
|
||||
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
|
||||
|
||||
placeholder: |
|
||||
Steps to reproduce the behavior:
|
||||
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
|
||||
|
||||
- type: textarea
|
||||
id: expected-behavior
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Expected behavior
|
||||
description: "A clear and concise description of what you would expect to happen."
|
||||
render: shell
|
||||
12
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
12
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@ -0,0 +1,12 @@
|
||||
blank_issues_enabled: true
|
||||
version: 2.1
|
||||
contact_links:
|
||||
- name: Model checkpoints on the Hugging Face Hub
|
||||
url: https://huggingface.co/models
|
||||
about: Open a Pull request / Discussion related to a specific model checkpoint directly on the Hugging Face Hub
|
||||
- name: Website Related
|
||||
url: https://github.com/huggingface/hub-docs/issues
|
||||
about: Feature requests and bug reports related to the website
|
||||
- name: Forum
|
||||
url: https://discuss.huggingface.co/
|
||||
about: General usage questions and community discussions
|
||||
25
.github/ISSUE_TEMPLATE/feature-request.md
vendored
25
.github/ISSUE_TEMPLATE/feature-request.md
vendored
@ -1,25 +0,0 @@
|
||||
---
|
||||
name: "\U0001F680 Feature request"
|
||||
about: Submit a proposal/request for a new transformers feature
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# 🚀 Feature request
|
||||
|
||||
<!-- A clear and concise description of the feature proposal.
|
||||
Please provide a link to the paper and code in case they exist. -->
|
||||
|
||||
## Motivation
|
||||
|
||||
<!-- Please outline the motivation for the proposal. Is your feature request
|
||||
related to a problem? e.g., I'm always frustrated when [...]. If this is related
|
||||
to another GitHub issue, please link here too. -->
|
||||
|
||||
## Your contribution
|
||||
|
||||
<!-- Is there any way that you could help, e.g. by submitting a PR?
|
||||
Make sure to read the CONTRIBUTING.MD readme:
|
||||
https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md -->
|
||||
31
.github/ISSUE_TEMPLATE/feature-request.yml
vendored
Normal file
31
.github/ISSUE_TEMPLATE/feature-request.yml
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
name: "\U0001F680 Feature request"
|
||||
description: Submit a proposal/request for a new transformers feature
|
||||
labels: [ "feature" ]
|
||||
body:
|
||||
- type: textarea
|
||||
id: feature-request
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Feature request
|
||||
description: |
|
||||
A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist.
|
||||
|
||||
- type: textarea
|
||||
id: motivation
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Motivation
|
||||
description: |
|
||||
Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
|
||||
|
||||
|
||||
- type: textarea
|
||||
id: contribution
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Your contribution
|
||||
description: |
|
||||
Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md)
|
||||
58
.github/ISSUE_TEMPLATE/migration.md
vendored
58
.github/ISSUE_TEMPLATE/migration.md
vendored
@ -1,58 +0,0 @@
|
||||
---
|
||||
name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
|
||||
about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers
|
||||
to transformers
|
||||
title: ''
|
||||
labels: Migration
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# 📚 Migration
|
||||
|
||||
## Information
|
||||
|
||||
<!-- Important information -->
|
||||
|
||||
Model I am using (Bert, XLNet ...):
|
||||
|
||||
Language I am using the model on (English, Chinese ...):
|
||||
|
||||
The problem arises when using:
|
||||
* [ ] the official example scripts: (give details below)
|
||||
* [ ] my own modified scripts: (give details below)
|
||||
|
||||
The tasks I am working on is:
|
||||
* [ ] an official GLUE/SQUaD task: (give the name)
|
||||
* [ ] my own task or dataset: (give details below)
|
||||
|
||||
## Details
|
||||
|
||||
<!-- A clear and concise description of the migration issue.
|
||||
If you have code snippets, please provide it here as well.
|
||||
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
|
||||
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
|
||||
-->
|
||||
|
||||
## Environment info
|
||||
<!-- You can run the command `python transformers-cli env` and copy-and-paste its output below.
|
||||
Don't forget to fill out the missing fields in that output! -->
|
||||
|
||||
- `transformers` version:
|
||||
- Platform:
|
||||
- Python version:
|
||||
- PyTorch version (GPU?):
|
||||
- Tensorflow version (GPU?):
|
||||
- Using GPU in script?:
|
||||
- Using distributed or parallel set-up in script?:
|
||||
|
||||
<!-- IMPORTANT: which version of the former library do you use? -->
|
||||
* `pytorch-transformers` or `pytorch-pretrained-bert` version (or branch):
|
||||
|
||||
|
||||
## Checklist
|
||||
|
||||
- [ ] I have read the migration guide in the readme.
|
||||
([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers);
|
||||
[pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers))
|
||||
- [ ] I checked if a related official extension example runs on my machine.
|
||||
72
.github/ISSUE_TEMPLATE/migration.yml
vendored
Normal file
72
.github/ISSUE_TEMPLATE/migration.yml
vendored
Normal file
@ -0,0 +1,72 @@
|
||||
name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
|
||||
description: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers
|
||||
labels: [ "migration" ]
|
||||
body:
|
||||
- type: textarea
|
||||
id: system-info
|
||||
attributes:
|
||||
label: System Info
|
||||
description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
|
||||
render: shell
|
||||
placeholder: transformers version, platform, python version, ...
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: checkboxes
|
||||
id: information-scripts-examples
|
||||
attributes:
|
||||
label: Information
|
||||
description: 'The problem arises when using:'
|
||||
options:
|
||||
- label: "The official example scripts"
|
||||
- label: "My own modified scripts"
|
||||
|
||||
- type: checkboxes
|
||||
id: information-tasks
|
||||
attributes:
|
||||
label: Tasks
|
||||
description: "The tasks I am working on are:"
|
||||
options:
|
||||
- label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)"
|
||||
- label: "My own task or dataset (give details below)"
|
||||
|
||||
- type: textarea
|
||||
id: reproduction
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Reproduction
|
||||
description: |
|
||||
Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
|
||||
If you have code snippets, error messages, stack traces please provide them here as well.
|
||||
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
|
||||
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
|
||||
|
||||
placeholder: |
|
||||
Steps to reproduce the behavior:
|
||||
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
|
||||
|
||||
- type: textarea
|
||||
id: expected-behavior
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Expected behavior
|
||||
description: "A clear and concise description of what you would expect to happen."
|
||||
render: shell
|
||||
|
||||
- type: checkboxes
|
||||
id: checklist
|
||||
attributes:
|
||||
label: Checklist
|
||||
options:
|
||||
- label: "I have read the migration guide in the readme.
|
||||
([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers);
|
||||
[pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers))"
|
||||
required: true
|
||||
- label: "I checked if a related official extension example runs on my machine."
|
||||
required: true
|
||||
31
.github/ISSUE_TEMPLATE/new-model-addition.yml
vendored
Normal file
31
.github/ISSUE_TEMPLATE/new-model-addition.yml
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
name: "\U0001F31F New model addition"
|
||||
description: Submit a proposal/request to implement a new model
|
||||
labels: [ "New model" ]
|
||||
|
||||
body:
|
||||
- type: textarea
|
||||
id: description-request
|
||||
validations:
|
||||
required: true
|
||||
attributes:
|
||||
label: Model description
|
||||
description: |
|
||||
Put any and all important information relative to the model
|
||||
|
||||
- type: checkboxes
|
||||
id: information-tasks
|
||||
attributes:
|
||||
label: Open source status
|
||||
description: |
|
||||
Please note that if the model implementation isn't available or if the weights aren't open-source, we are less likely to implement it in `transformers`.
|
||||
options:
|
||||
- label: "The model implementation is available"
|
||||
- label: "The model weights are available"
|
||||
|
||||
- type: textarea
|
||||
id: additional-info
|
||||
attributes:
|
||||
label: Provide useful links for the implementation
|
||||
description: |
|
||||
Please provide information regarding the implementation, the weights, and the authors.
|
||||
Please mention the authors by @gh-username if you're aware of their usernames.
|
||||
26
.github/ISSUE_TEMPLATE/question-help.md
vendored
26
.github/ISSUE_TEMPLATE/question-help.md
vendored
@ -1,26 +0,0 @@
|
||||
---
|
||||
name: "❓ Questions & Help"
|
||||
about: Post your general questions on the Hugging Face forum: https://discuss.huggingface.co/
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# ❓ Questions & Help
|
||||
|
||||
<!-- The GitHub issue tracker is primarly intended for bugs, feature requests,
|
||||
new models, benchmarks, and migration questions. For all other questions,
|
||||
we direct you to the Hugging Face forum: https://discuss.huggingface.co/ .
|
||||
-->
|
||||
|
||||
## Details
|
||||
|
||||
<!-- Description of your issue -->
|
||||
|
||||
<!-- You should first ask your question on the forum, and only if
|
||||
you didn't get an answer after a few days ask it here on GitHub. -->
|
||||
|
||||
**A link to original question on the forum**:
|
||||
|
||||
<!-- Your issue will be closed if you don't fill this part. -->
|
||||
34
.github/workflows/add-model-like.yml
vendored
34
.github/workflows/add-model-like.yml
vendored
@ -18,32 +18,52 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Loading cache.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt -y update && sudo apt install -y libsndfile1-dev
|
||||
|
||||
- name: Load cached virtual environment
|
||||
uses: actions/cache@v2
|
||||
id: cache
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: v1-tests_model_like-${{ hashFiles('setup.py') }}
|
||||
path: ~/venv/
|
||||
key: v3-tests_model_like-${{ hashFiles('setup.py') }}
|
||||
|
||||
- name: Install dependencies
|
||||
- name: Create virtual environment on cache miss
|
||||
if: steps.cache.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python -m venv ~/venv && . ~/venv/bin/activate
|
||||
pip install --upgrade pip!=21.3
|
||||
pip install -U click # Click 7 is installed in the environment by default, but we need at least version 8 for Black
|
||||
sudo apt -y update && sudo apt install -y libsndfile1-dev
|
||||
pip install .[dev]
|
||||
pip install -e .[dev]
|
||||
|
||||
- name: Check transformers location
|
||||
# make `transformers` available as package (required since we use `-e` flag) and check it's indeed from the repo.
|
||||
run: |
|
||||
. ~/venv/bin/activate
|
||||
python setup.py develop
|
||||
transformer_loc=$(pip show transformers | grep "Location: " | cut -c11-)
|
||||
transformer_repo_loc=$(pwd .)
|
||||
if [ "$transformer_loc" != "$transformer_repo_loc/src" ]; then
|
||||
echo "transformers is from $transformer_loc but it shoud be from $transformer_repo_loc/src."
|
||||
echo "A fix is required. Stop testing."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Create model files
|
||||
run: |
|
||||
. ~/venv/bin/activate
|
||||
transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
|
||||
make style
|
||||
make fix-copies
|
||||
|
||||
- name: Run all PyTorch modeling test
|
||||
run: |
|
||||
. ~/venv/bin/activate
|
||||
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_new_models tests/bert_new/test_modeling_bert_new.py
|
||||
|
||||
- name: Run style changes
|
||||
run: |
|
||||
. ~/venv/bin/activate
|
||||
make style && make quality && make repo-consistency
|
||||
|
||||
- name: Failure short reports
|
||||
|
||||
3
.github/workflows/build-docker-images.yml
vendored
3
.github/workflows/build-docker-images.yml
vendored
@ -41,7 +41,6 @@ jobs:
|
||||
|
||||
latest-torch-deepspeed-docker:
|
||||
name: "Latest PyTorch + DeepSpeed"
|
||||
needs: latest-docker
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
-
|
||||
@ -93,7 +92,6 @@ jobs:
|
||||
latest-pytorch:
|
||||
name: "Latest PyTorch [dev]"
|
||||
runs-on: ubuntu-latest
|
||||
needs: latest-torch-deepspeed-docker
|
||||
steps:
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
@ -118,7 +116,6 @@ jobs:
|
||||
tags: huggingface/transformers-pytorch-gpu
|
||||
|
||||
latest-tensorflow:
|
||||
needs: latest-pytorch
|
||||
name: "Latest TensorFlow [dev]"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
|
||||
2
.github/workflows/build_documentation.yml
vendored
2
.github/workflows/build_documentation.yml
vendored
@ -15,6 +15,6 @@ jobs:
|
||||
commit_sha: ${{ github.sha }}
|
||||
package: transformers
|
||||
notebook_folder: transformers_doc
|
||||
languages: en es
|
||||
languages: en es it pt
|
||||
secrets:
|
||||
token: ${{ secrets.HUGGINGFACE_PUSH }}
|
||||
|
||||
2
.github/workflows/build_pr_documentation.yml
vendored
2
.github/workflows/build_pr_documentation.yml
vendored
@ -14,4 +14,4 @@ jobs:
|
||||
commit_sha: ${{ github.event.pull_request.head.sha }}
|
||||
pr_number: ${{ github.event.number }}
|
||||
package: transformers
|
||||
languages: en es
|
||||
languages: en es it pt
|
||||
|
||||
4
.github/workflows/doctests.yml
vendored
4
.github/workflows/doctests.yml
vendored
@ -32,9 +32,7 @@ jobs:
|
||||
|
||||
- name: GPU visibility
|
||||
run: |
|
||||
utils/print_env_pt.py
|
||||
TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
|
||||
TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Prepare files for doctests
|
||||
run: |
|
||||
|
||||
46
.github/workflows/github-torch-hub.yml
vendored
46
.github/workflows/github-torch-hub.yml
vendored
@ -1,46 +0,0 @@
|
||||
name: Torch hub integration
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- "*"
|
||||
|
||||
jobs:
|
||||
torch_hub_integration:
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
# TODO quickfix but may need more investigation
|
||||
ACTIONS_ALLOW_UNSECURE_COMMANDS: True
|
||||
steps:
|
||||
# no checkout necessary here.
|
||||
- name: Extract branch name
|
||||
run: echo "::set-env name=BRANCH::${GITHUB_REF#refs/heads/}"
|
||||
- name: Check branch name
|
||||
run: echo $BRANCH
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v1
|
||||
with:
|
||||
python-version: 3.7
|
||||
|
||||
- name: Loading cache
|
||||
uses: actions/cache@v2
|
||||
id: cache
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: v0-torch_hub-${{ hashFiles('setup.py') }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install --upgrade pip
|
||||
# install torch-hub specific dependencies
|
||||
pip install -e git+https://github.com/huggingface/transformers.git#egg=transformers[torchhub]
|
||||
# no longer needed
|
||||
pip uninstall -y transformers
|
||||
|
||||
#- name: Torch hub list
|
||||
# run: |
|
||||
# python -c "import torch; print(torch.hub.list('huggingface/transformers:$BRANCH'))"
|
||||
|
||||
#- name: Torch hub help
|
||||
# run: |
|
||||
# python -c "import torch; print(torch.hub.help('huggingface/transformers:$BRANCH', 'modelForSequenceClassification'))"
|
||||
61
.github/workflows/model-templates.yml
vendored
61
.github/workflows/model-templates.yml
vendored
@ -1,43 +1,51 @@
|
||||
name: Model templates runner
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
types: [assigned, opened, synchronize, reopened]
|
||||
repository_dispatch:
|
||||
schedule:
|
||||
- cron: "0 2 * * *"
|
||||
|
||||
jobs:
|
||||
run_tests_templates:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v1
|
||||
with:
|
||||
python-version: 3.6
|
||||
|
||||
- name: Loading cache.
|
||||
uses: actions/cache@v2
|
||||
id: cache
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: v1.2-tests_templates-${{ hashFiles('setup.py') }}
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install --upgrade pip!=21.3
|
||||
sudo apt -y update && sudo apt install -y libsndfile1-dev
|
||||
pip install .[dev]
|
||||
|
||||
- name: Load cached virtual environment
|
||||
uses: actions/cache@v2
|
||||
id: cache
|
||||
with:
|
||||
path: ~/venv/
|
||||
key: v3-tests_templates-${{ hashFiles('setup.py') }}
|
||||
|
||||
- name: Create virtual environment on cache miss
|
||||
if: steps.cache.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python -m venv ~/venv && . ~/venv/bin/activate
|
||||
pip install --upgrade pip!=21.3
|
||||
pip install -e .[dev]
|
||||
|
||||
- name: Check transformers location
|
||||
# make `transformers` available as package (required since we use `-e` flag) and check it's indeed from the repo.
|
||||
run: |
|
||||
. ~/venv/bin/activate
|
||||
python setup.py develop
|
||||
transformer_loc=$(pip show transformers | grep "Location: " | cut -c11-)
|
||||
transformer_repo_loc=$(pwd .)
|
||||
if [ "$transformer_loc" != "$transformer_repo_loc/src" ]; then
|
||||
echo "transformers is from $transformer_loc but it shoud be from $transformer_repo_loc/src."
|
||||
echo "A fix is required. Stop testing."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Create model files
|
||||
run: |
|
||||
. ~/venv/bin/activate
|
||||
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
|
||||
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
|
||||
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
|
||||
@ -53,11 +61,12 @@ jobs:
|
||||
|
||||
- name: Run all non-slow tests
|
||||
run: |
|
||||
. ~/venv/bin/activate
|
||||
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_templates tests/*template*
|
||||
|
||||
- name: Run style changes
|
||||
run: |
|
||||
git fetch origin main:main
|
||||
. ~/venv/bin/activate
|
||||
make style && make quality && make repo-consistency
|
||||
|
||||
- name: Failure short reports
|
||||
|
||||
24
.github/workflows/self-nightly-scheduled.yml
vendored
24
.github/workflows/self-nightly-scheduled.yml
vendored
@ -41,7 +41,7 @@ jobs:
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
utils/print_env_pt.py
|
||||
utils/print_env.py
|
||||
|
||||
- name: Run all tests on GPU
|
||||
run: |
|
||||
@ -49,7 +49,7 @@ jobs:
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_gpu_failures_short.txt
|
||||
run: cat reports/tests_torch_gpu/failures_short.txt
|
||||
|
||||
- name: Run examples tests on GPU
|
||||
if: ${{ always() }}
|
||||
@ -65,7 +65,7 @@ jobs:
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/examples_torch_gpu_failures_short.txt
|
||||
run: cat reports/examples_torch_gpu/failures_short.txt
|
||||
|
||||
- name: Run all pipeline tests on GPU
|
||||
if: ${{ always() }}
|
||||
@ -76,7 +76,7 @@ jobs:
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
|
||||
run: cat reports/tests_torch_pipeline_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
@ -109,7 +109,7 @@ jobs:
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
utils/print_env_pt.py
|
||||
utils/print_env.py
|
||||
|
||||
- name: Run all tests on GPU
|
||||
env:
|
||||
@ -119,7 +119,7 @@ jobs:
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_multi_gpu_failures_short.txt
|
||||
run: cat reports/tests_torch_multi_gpu/failures_short.txt
|
||||
|
||||
- name: Run all pipeline tests on GPU
|
||||
if: ${{ always() }}
|
||||
@ -130,7 +130,7 @@ jobs:
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
|
||||
run: cat reports/tests_torch_pipeline_multi_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
@ -157,13 +157,13 @@ jobs:
|
||||
apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng
|
||||
pip install --upgrade pip
|
||||
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
|
||||
pip install .[testing,deepspeed]
|
||||
pip install .[deepspeed-testing]
|
||||
pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
pip install git+https://github.com/microsoft/DeepSpeed
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
utils/print_env_pt.py
|
||||
utils/print_env.py
|
||||
|
||||
- name: Run all tests on GPU
|
||||
run: |
|
||||
@ -171,7 +171,7 @@ jobs:
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
|
||||
run: cat reports/tests_torch_cuda_extensions_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
@ -206,7 +206,7 @@ jobs:
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
utils/print_env_pt.py
|
||||
utils/print_env.py
|
||||
|
||||
- name: Run all tests on GPU
|
||||
run: |
|
||||
@ -214,7 +214,7 @@ jobs:
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
|
||||
run: cat reports/tests_torch_cuda_extensions_multi_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
|
||||
29
.github/workflows/self-push-caller.yml
vendored
Normal file
29
.github/workflows/self-push-caller.yml
vendored
Normal file
@ -0,0 +1,29 @@
|
||||
name: Self-hosted runner (push-caller)
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
- "utils/**"
|
||||
|
||||
jobs:
|
||||
run_push_ci:
|
||||
name: Run Push CI
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
ssh-key: "${{ secrets.COMMIT_KEY }}"
|
||||
|
||||
- name: Checkout to branch push-ci
|
||||
# A more strict way to make sure`push-ci` is exactly the same as `main` at the push event commit.
|
||||
run: |
|
||||
git checkout -b push-ci
|
||||
git push -u origin push-ci --force
|
||||
556
.github/workflows/self-push.yml
vendored
556
.github/workflows/self-push.yml
vendored
@ -3,7 +3,7 @@ name: Self-hosted runner (push)
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- push-ci
|
||||
- ci_*
|
||||
- ci-*
|
||||
paths:
|
||||
@ -20,37 +20,32 @@ env:
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
PYTEST_TIMEOUT: 60
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
run_tests_torch_gpu:
|
||||
runs-on: [self-hosted, docker-gpu, single-gpu]
|
||||
container:
|
||||
image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
setup:
|
||||
name: Setup
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
|
||||
apt install -y libsndfile1-dev espeak-ng
|
||||
pip install --upgrade pip
|
||||
pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
|
||||
pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
|
||||
- name: Launcher docker
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
- name: Cleanup
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
utils/print_env_pt.py
|
||||
rm -rf tests/__pycache__
|
||||
rm -rf tests/models/__pycache__
|
||||
rm -rf reports
|
||||
|
||||
- name: Fetch the tests to run
|
||||
# TODO: add `git-python` in the docker images
|
||||
run: |
|
||||
pip install --upgrade git-python
|
||||
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
||||
|
||||
- name: Report fetched tests
|
||||
@ -59,441 +54,266 @@ jobs:
|
||||
name: test_fetched
|
||||
path: test_preparation.txt
|
||||
|
||||
- name: Run all non-slow tests on GPU
|
||||
- id: set-matrix
|
||||
name: Organize tests into models
|
||||
# The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc.
|
||||
# The `test_map` is used to get the actual identified test files under each key.
|
||||
# If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail)
|
||||
run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu $(cat test_list.txt)
|
||||
if [ -f test_map.json ]; then
|
||||
keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)')
|
||||
test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)')
|
||||
else
|
||||
keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
|
||||
test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
|
||||
fi
|
||||
echo $keys
|
||||
echo $test_map
|
||||
echo "::set-output name=matrix::$keys"
|
||||
echo "::set-output name=test_map::$test_map"
|
||||
|
||||
run_tests_single_gpu:
|
||||
name: Model tests
|
||||
needs: setup
|
||||
# `dummy` means there is no test to run
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||
machine_type: [single-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Run all non-slow selected tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_gpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: run_all_tests_torch_gpu_test_reports
|
||||
path: reports
|
||||
|
||||
run_tests_flax_gpu:
|
||||
runs-on: [self-hosted, docker-gpu-test, single-gpu]
|
||||
container:
|
||||
image: tensorflow/tensorflow:2.4.1-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Set up Python 3.7
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.7
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
|
||||
pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
|
||||
pip install --upgrade pip
|
||||
pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
|
||||
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
continue-on-error: true
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
|
||||
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
|
||||
|
||||
- name: Fetch the tests to run
|
||||
run: |
|
||||
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
||||
|
||||
- name: Report fetched tests
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: test_fetched
|
||||
path: test_preparation.txt
|
||||
|
||||
- name: Run all non-slow tests on GPU
|
||||
run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
|
||||
fi
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_flax_gpu_failures_short.txt
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: run_all_tests_flax_gpu_test_reports
|
||||
path: reports
|
||||
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
|
||||
|
||||
# run_tests_tf_gpu:
|
||||
# runs-on: [self-hosted, docker-gpu, single-gpu]
|
||||
# timeout-minutes: 120
|
||||
# container:
|
||||
# image: tensorflow/tensorflow:2.4.1-gpu
|
||||
# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
# steps:
|
||||
# - name: Install dependencies
|
||||
# run: |
|
||||
# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
|
||||
# pip install --upgrade pip
|
||||
# pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
|
||||
# pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
#
|
||||
# - name: Launcher docker
|
||||
# uses: actions/checkout@v2
|
||||
# with:
|
||||
# fetch-depth: 2
|
||||
#
|
||||
# - name: NVIDIA-SMI
|
||||
# run: |
|
||||
# nvidia-smi
|
||||
#
|
||||
# - name: Are GPUs recognized by our DL frameworks
|
||||
# run: |
|
||||
# TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
|
||||
# TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
|
||||
#
|
||||
# - name: Fetch the tests to run
|
||||
# run: |
|
||||
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
||||
#
|
||||
# - name: Report fetched tests
|
||||
# uses: actions/upload-artifact@v2
|
||||
# with:
|
||||
# name: test_fetched
|
||||
# path: test_preparation.txt
|
||||
#
|
||||
# - name: Run all non-slow tests on GPU
|
||||
# env:
|
||||
# TF_NUM_INTRAOP_THREADS: 8
|
||||
# TF_NUM_INTEROP_THREADS: 1
|
||||
# run: |
|
||||
# if [ -f test_list.txt ]; then
|
||||
# python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu $(cat test_list.txt)
|
||||
# fi
|
||||
#
|
||||
# - name: Failure short reports
|
||||
# if: ${{ failure() }}
|
||||
# run: cat reports/tests_tf_gpu_failures_short.txt
|
||||
#
|
||||
# - name: Test suite reports artifacts
|
||||
# if: ${{ always() }}
|
||||
# uses: actions/upload-artifact@v2
|
||||
# with:
|
||||
# name: run_all_tests_tf_gpu_test_reports
|
||||
# path: reports
|
||||
|
||||
|
||||
run_tests_torch_multi_gpu:
|
||||
runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||
run_tests_multi_gpu:
|
||||
name: Model tests
|
||||
needs: setup
|
||||
# `dummy` means there is no test to run
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||
machine_type: [multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
|
||||
container:
|
||||
image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
|
||||
apt install -y libsndfile1-dev espeak-ng
|
||||
pip install --upgrade pip
|
||||
pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
|
||||
pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
continue-on-error: true
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
utils/print_env_pt.py
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Fetch the tests to run
|
||||
run: |
|
||||
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
||||
|
||||
- name: Report fetched tests
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: test_fetched
|
||||
path: test_preparation.txt
|
||||
|
||||
- name: Run all non-slow tests on GPU
|
||||
- name: Run all non-slow selected tests on GPU
|
||||
env:
|
||||
MKL_SERVICE_FORCE_INTEL: 1
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu $(cat test_list.txt)
|
||||
fi
|
||||
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_multi_gpu_failures_short.txt
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: run_all_tests_torch_multi_gpu_test_reports
|
||||
path: reports
|
||||
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
|
||||
|
||||
# run_tests_flax_multi_gpu:
|
||||
# runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||
# container:
|
||||
# image: tensorflow/tensorflow:2.4.1-gpu
|
||||
# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
# steps:
|
||||
# - name: Install dependencies
|
||||
# run: |
|
||||
# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
|
||||
# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
|
||||
# pip install --upgrade pip
|
||||
# pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
|
||||
# pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
#
|
||||
# - name: Launcher docker
|
||||
# uses: actions/checkout@v2
|
||||
# with:
|
||||
# fetch-depth: 2
|
||||
#
|
||||
# - name: NVIDIA-SMI
|
||||
# continue-on-error: true
|
||||
# run: |
|
||||
# nvidia-smi
|
||||
#
|
||||
# - name: Are GPUs recognized by our DL frameworks
|
||||
# run: |
|
||||
# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
|
||||
# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
|
||||
#
|
||||
# - name: Fetch the tests to run
|
||||
# run: |
|
||||
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
||||
#
|
||||
# - name: Report fetched tests
|
||||
# uses: actions/upload-artifact@v2
|
||||
# with:
|
||||
# name: test_fetched
|
||||
# path: test_preparation.txt
|
||||
#
|
||||
# - name: Run all non-slow tests on GPU
|
||||
# run: |
|
||||
# if [ -f test_list.txt ]; then
|
||||
# python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu $(cat test_list.txt)
|
||||
# fi
|
||||
#
|
||||
# - name: Failure short reports
|
||||
# if: ${{ failure() }}
|
||||
# run: cat reports/tests_flax_multi_gpu_failures_short.txt
|
||||
#
|
||||
# - name: Test suite reports artifacts
|
||||
# if: ${{ always() }}
|
||||
# uses: actions/upload-artifact@v2
|
||||
# with:
|
||||
# name: run_all_tests_flax_multi_gpu_test_reports
|
||||
# path: reports
|
||||
|
||||
# run_tests_tf_multi_gpu:
|
||||
# runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||
# timeout-minutes: 120
|
||||
# container:
|
||||
# image: tensorflow/tensorflow:2.4.1-gpu
|
||||
# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
# steps:
|
||||
# - name: Install dependencies
|
||||
# run: |
|
||||
# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
|
||||
# pip install --upgrade pip
|
||||
# pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
|
||||
# pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||
#
|
||||
# - name: Launcher docker
|
||||
# uses: actions/checkout@v2
|
||||
# with:
|
||||
# fetch-depth: 2
|
||||
#
|
||||
# - name: NVIDIA-SMI
|
||||
# run: |
|
||||
# nvidia-smi
|
||||
#
|
||||
# - name: Are GPUs recognized by our DL frameworks
|
||||
# run: |
|
||||
# TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
|
||||
# TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
|
||||
#
|
||||
# - name: Fetch the tests to run
|
||||
# run: |
|
||||
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
||||
#
|
||||
# - name: Report fetched tests
|
||||
# uses: actions/upload-artifact@v2
|
||||
# with:
|
||||
# name: test_fetched
|
||||
# path: test_preparation.txt
|
||||
#
|
||||
# - name: Run all non-slow tests on GPU
|
||||
# env:
|
||||
# TF_NUM_INTRAOP_THREADS: 8
|
||||
# TF_NUM_INTEROP_THREADS: 1
|
||||
# run: |
|
||||
# if [ -f test_list.txt ]; then
|
||||
# python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu $(cat test_list.txt)
|
||||
# fi
|
||||
#
|
||||
# - name: Failure short reports
|
||||
# if: ${{ failure() }}
|
||||
# run: cat reports/tests_tf_multi_gpu_failures_short.txt
|
||||
#
|
||||
# - name: Test suite reports artifacts
|
||||
# if: ${{ always() }}
|
||||
# uses: actions/upload-artifact@v2
|
||||
# with:
|
||||
# name: run_all_tests_tf_multi_gpu_test_reports
|
||||
# path: reports
|
||||
|
||||
run_tests_torch_cuda_extensions_gpu:
|
||||
runs-on: [self-hosted, docker-gpu, single-gpu]
|
||||
run_tests_torch_cuda_extensions_single_gpu:
|
||||
name: Torch CUDA extension tests
|
||||
needs: setup
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [single-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
|
||||
container:
|
||||
image: nvcr.io/nvidia/pytorch:21.03-py3
|
||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
- name: Update clone
|
||||
working-directory: /workspace/transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
# To avoid unknown test failures
|
||||
- name: Pre build DeepSpeed *again*
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
python3 -m pip uninstall -y deepspeed
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
- name: Environment
|
||||
run: |
|
||||
apt -y update && apt install -y libaio-dev
|
||||
pip install --upgrade pip
|
||||
pip install .[testing,deepspeed]
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
- name: Run all non-slow selected tests on GPU
|
||||
# TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
|
||||
run: |
|
||||
utils/print_env_pt.py
|
||||
|
||||
- name: Fetch the tests to run
|
||||
run: |
|
||||
python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt
|
||||
|
||||
- name: Report fetched tests
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: test_fetched
|
||||
path: test_preparation.txt
|
||||
|
||||
- name: Run all tests on GPU
|
||||
run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu $(cat test_list.txt)
|
||||
fi
|
||||
python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
|
||||
continue-on-error: true
|
||||
run: cat reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: run_tests_torch_cuda_extensions_gpu_test_reports
|
||||
path: reports
|
||||
name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
|
||||
path: reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
|
||||
|
||||
run_tests_torch_cuda_extensions_multi_gpu:
|
||||
runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||
name: Torch CUDA extension tests
|
||||
needs: setup
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
|
||||
container:
|
||||
image: nvcr.io/nvidia/pytorch:21.03-py3
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
- name: Update clone
|
||||
working-directory: /workspace/transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
# To avoid unknown test failures
|
||||
- name: Pre build DeepSpeed *again*
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
python3 -m pip uninstall -y deepspeed
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
continue-on-error: true
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
- name: Environment
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
apt -y update && apt install -y libaio-dev
|
||||
pip install --upgrade pip
|
||||
rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
|
||||
pip install .[testing,deepspeed,fairscale]
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
- name: Run all non-slow selected tests on GPU
|
||||
working-directory: /workspace/transformers
|
||||
# TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
|
||||
run: |
|
||||
utils/print_env_pt.py
|
||||
|
||||
- name: Fetch the tests to run
|
||||
run: |
|
||||
python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt
|
||||
|
||||
- name: Report fetched tests
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: test_fetched
|
||||
path: test_preparation.txt
|
||||
|
||||
- name: Run all tests on GPU
|
||||
run: |
|
||||
if [ -f test_list.txt ]; then
|
||||
python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu $(cat test_list.txt)
|
||||
fi
|
||||
python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
|
||||
continue-on-error: true
|
||||
run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
|
||||
path: reports
|
||||
|
||||
name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
|
||||
path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
|
||||
|
||||
send_results:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [
|
||||
run_tests_torch_gpu,
|
||||
# run_tests_tf_gpu,
|
||||
run_tests_torch_multi_gpu,
|
||||
# run_tests_tf_multi_gpu,
|
||||
run_tests_torch_cuda_extensions_gpu,
|
||||
setup,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
run_tests_torch_cuda_extensions_single_gpu,
|
||||
run_tests_torch_cuda_extensions_multi_gpu
|
||||
]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- uses: actions/download-artifact@v2
|
||||
|
||||
- name: Send message to Slack
|
||||
env:
|
||||
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
|
||||
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
CI_EVENT: push
|
||||
CI_TITLE: ${{ github.event.head_commit.message }}
|
||||
CI_COMMIT_URL: ${{ github.event.head_commit.url }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
pip install slack_sdk
|
||||
python utils/notification_service_deprecated.py push
|
||||
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
|
||||
|
||||
194
.github/workflows/self-scheduled.yml
vendored
194
.github/workflows/self-scheduled.yml
vendored
@ -26,8 +26,8 @@ jobs:
|
||||
name: Setup
|
||||
strategy:
|
||||
matrix:
|
||||
machines: [multi-gpu-docker, single-gpu-docker]
|
||||
runs-on: ${{ matrix.machines }}
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@ -43,60 +43,124 @@ jobs:
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
rm -rf tests/__pycache__
|
||||
rm -rf tests/models/__pycache__
|
||||
rm -rf reports
|
||||
|
||||
- id: set-matrix
|
||||
name: Identify models to test
|
||||
working-directory: /transformers/tests
|
||||
run: |
|
||||
echo "::set-output name=matrix::$(python3 -c 'import os; x = list(filter(os.path.isdir, os.listdir(os.getcwd()))); x.sort(); print(x)')"
|
||||
echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: GPU visibility
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
utils/print_env_pt.py
|
||||
TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
|
||||
TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
|
||||
|
||||
run_tests_gpu:
|
||||
run_tests_single_gpu:
|
||||
name: Model tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||
machines: [multi-gpu-docker, single-gpu-docker]
|
||||
runs-on: ${{ matrix.machines }}
|
||||
machine_type: [single-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
run: echo "${{ matrix.folders }}"
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: python3 -m pytest -v --make-reports=${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
|
||||
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: ${{ matrix.machines }}_run_all_tests_gpu_${{ matrix.folders }}_test_reports
|
||||
path: /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}
|
||||
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
|
||||
|
||||
run_tests_multi_gpu:
|
||||
name: Model tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||
machine_type: [multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
|
||||
|
||||
run_examples_gpu:
|
||||
name: Examples directory
|
||||
@ -110,6 +174,15 @@ jobs:
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Run examples tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
@ -133,46 +206,55 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machines: [multi-gpu-docker, single-gpu-docker]
|
||||
runs-on: ${{ matrix.machines }}
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Run all pipeline tests on GPU
|
||||
working-directory: /transformers
|
||||
env:
|
||||
RUN_PIPELINE_TESTS: yes
|
||||
run: |
|
||||
python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machines }}_tests_torch_pipeline_gpu tests
|
||||
python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machines }}_tests_torch_pipeline_gpu/failures_short.txt
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: ${{ matrix.machines }}_run_tests_torch_pipeline_gpu
|
||||
path: /transformers/reports/${{ matrix.machines }}_tests_torch_pipeline_gpu
|
||||
name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
|
||||
|
||||
run_pipelines_tf_gpu:
|
||||
name: TensorFlow pipelines
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machines: [multi-gpu-docker, single-gpu-docker]
|
||||
runs-on: ${{ matrix.machines }}
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
container:
|
||||
image: huggingface/transformers-tensorflow-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
@ -180,32 +262,41 @@ jobs:
|
||||
run: |
|
||||
git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Run all pipeline tests on GPU
|
||||
working-directory: /transformers
|
||||
env:
|
||||
RUN_PIPELINE_TESTS: yes
|
||||
run: |
|
||||
python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machines }}_tests_tf_pipeline_gpu tests
|
||||
python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
cat /transformers/reports/${{ matrix.machines }}_tests_tf_pipeline_gpu/failures_short.txt
|
||||
cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: ${{ matrix.machines }}_run_tests_tf_pipeline_gpu
|
||||
path: /transformers/reports/${{ matrix.machines }}_tests_tf_pipeline_gpu
|
||||
name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu
|
||||
|
||||
run_all_tests_torch_cuda_extensions_gpu:
|
||||
name: Torch CUDA extension tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machines: [multi-gpu-docker, single-gpu-docker]
|
||||
runs-on: ${{ matrix.machines }}
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
needs: setup
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
@ -215,37 +306,44 @@ jobs:
|
||||
working-directory: /workspace/transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: Re-compile DeepSpeed
|
||||
working-directory: /workspace
|
||||
# To avoid unknown test failures
|
||||
- name: Pre build DeepSpeed *again*
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
pip install deepspeed # installs the deps correctly
|
||||
rm -rf DeepSpeed
|
||||
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
python3 -m pip uninstall -y deepspeed
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
python -m pytest -v --make-reports=${{ matrix.machines }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
|
||||
python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /workspace/transformers/reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu/failures_short.txt
|
||||
run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: ${{ matrix.machines }}_run_tests_torch_cuda_extensions_gpu_test_reports
|
||||
path: /workspace/transformers/reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu
|
||||
|
||||
name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
|
||||
path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
|
||||
|
||||
send_results:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [setup, run_tests_gpu, run_examples_gpu, run_pipelines_tf_gpu, run_pipelines_torch_gpu, run_all_tests_torch_cuda_extensions_gpu]
|
||||
needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_examples_gpu, run_pipelines_tf_gpu, run_pipelines_torch_gpu, run_all_tests_torch_cuda_extensions_gpu]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/download-artifact@v2
|
||||
@ -255,6 +353,10 @@ jobs:
|
||||
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_EVENT: scheduled
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
pip install slack_sdk
|
||||
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
|
||||
|
||||
14
.github/workflows/update_metdata.yml
vendored
14
.github/workflows/update_metdata.yml
vendored
@ -16,17 +16,25 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Loading cache.
|
||||
- name: Load cached virtual environment
|
||||
uses: actions/cache@v2
|
||||
id: cache
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: v1-metadata-${{ hashFiles('setup.py') }}
|
||||
path: ~/venv/
|
||||
key: v2-metadata-${{ hashFiles('setup.py') }}
|
||||
|
||||
- name: Create virtual environment on cache miss
|
||||
if: steps.cache.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python -m venv ~/venv && . ~/venv/bin/activate
|
||||
pip install --upgrade pip
|
||||
|
||||
- name: Setup environment
|
||||
run: |
|
||||
. ~/venv/bin/activate
|
||||
pip install git+https://github.com/huggingface/transformers#egg=transformers[dev]
|
||||
|
||||
- name: Update metadata
|
||||
run: |
|
||||
. ~/venv/bin/activate
|
||||
python utils/update_metadata.py --token ${{ secrets.SYLVAIN_HF_TOKEN }} --commit_sha ${{ github.sha }}
|
||||
|
||||
@ -368,8 +368,7 @@ For documentation strings, 🤗 Transformers follows the [google style](https://
|
||||
Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
|
||||
for more information.
|
||||
|
||||
#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md)
|
||||
|
||||
**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
|
||||
|
||||
### Develop on Windows
|
||||
|
||||
|
||||
9
Makefile
9
Makefile
@ -9,7 +9,7 @@ modified_only_fixup:
|
||||
$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
|
||||
@if test -n "$(modified_py_files)"; then \
|
||||
echo "Checking/fixing $(modified_py_files)"; \
|
||||
black $(modified_py_files); \
|
||||
black --preview $(modified_py_files); \
|
||||
isort $(modified_py_files); \
|
||||
flake8 $(modified_py_files); \
|
||||
else \
|
||||
@ -39,14 +39,16 @@ repo-consistency:
|
||||
python utils/check_dummies.py
|
||||
python utils/check_repo.py
|
||||
python utils/check_inits.py
|
||||
python utils/check_config_docstrings.py
|
||||
python utils/tests_fetcher.py --sanity_check
|
||||
|
||||
# this target runs checks on all files
|
||||
|
||||
quality:
|
||||
black --check $(check_dirs)
|
||||
black --check --preview $(check_dirs)
|
||||
isort --check-only $(check_dirs)
|
||||
python utils/custom_init_isort.py --check_only
|
||||
python utils/sort_auto_mappings.py --check_only
|
||||
flake8 $(check_dirs)
|
||||
doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
|
||||
|
||||
@ -54,12 +56,13 @@ quality:
|
||||
|
||||
extra_style_checks:
|
||||
python utils/custom_init_isort.py
|
||||
python utils/sort_auto_mappings.py
|
||||
doc-builder style src/transformers docs/source --max_len 119 --path_to_docs docs/source
|
||||
|
||||
# this target runs checks on all files and potentially modifies some of them
|
||||
|
||||
style:
|
||||
black $(check_dirs)
|
||||
black --preview $(check_dirs)
|
||||
isort $(check_dirs)
|
||||
${MAKE} autogenerate_code
|
||||
${MAKE} extra_style_checks
|
||||
|
||||
65
README.md
65
README.md
@ -234,77 +234,86 @@ Current number of checkpoints: ** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
|
||||
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
|
||||
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
|
||||
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
|
||||
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BLOOM](https://huggingface.co/docs/transformers/main/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/).
|
||||
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
|
||||
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
|
||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
||||
1. **[ConvNeXT](https://huggingface.co/docs/transformers/main/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
|
||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
||||
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
|
||||
1. **[ConvNeXT](https://huggingface.co/docs/transformers/main/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
|
||||
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
|
||||
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
1. **[CvT](https://huggingface.co/docs/transformers/main/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
|
||||
1. **[Data2Vec](https://huggingface.co/docs/transformers/main/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
|
||||
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
|
||||
1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
|
||||
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
|
||||
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
|
||||
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
|
||||
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
|
||||
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval
|
||||
for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
|
||||
Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
|
||||
1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
|
||||
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
|
||||
1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
|
||||
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
|
||||
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/main/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/main/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
||||
1. **[GPT NeoX](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
|
||||
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
|
||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
||||
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
|
||||
1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
|
||||
1. **[ImageGPT](https://huggingface.co/docs/transformers/main/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/main/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
|
||||
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
|
||||
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LeViT](https://huggingface.co/docs/transformers/main/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
|
||||
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LongT5](https://huggingface.co/docs/transformers/main/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
|
||||
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
|
||||
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
|
||||
1. **[M-CTC-T](https://huggingface.co/docs/transformers/main/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
|
||||
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
|
||||
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
|
||||
1. **[MaskFormer](https://huggingface.co/docs/transformers/main/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
|
||||
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
|
||||
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
|
||||
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/main/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
|
||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/main/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[PoolFormer](https://huggingface.co/docs/transformers/main/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
|
||||
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
|
||||
1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
|
||||
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
|
||||
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
|
||||
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
|
||||
1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
|
||||
1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
|
||||
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
@ -313,34 +322,36 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
|
||||
1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/main/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER
|
||||
AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
1. **[VAN](https://huggingface.co/docs/transformers/main/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
1. **[ViLT](https://huggingface.co/docs/transformers/main/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
|
||||
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/main/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
|
||||
1. **[WavLM](https://huggingface.co/docs/transformers/main/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
|
||||
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
|
||||
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
|
||||
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
|
||||
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
|
||||
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/main/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
|
||||
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[YOLOS](https://huggingface.co/docs/transformers/main/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
|
||||
1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
|
||||
|
||||
To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
|
||||
|
||||
42
README_ko.md
42
README_ko.md
@ -221,6 +221,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BLOOM](https://huggingface.co/docs/transformers/main/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/).
|
||||
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
|
||||
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
|
||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
@ -230,6 +231,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[ConvNeXT](https://huggingface.co/docs/transformers/main/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
|
||||
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
|
||||
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
1. **[CvT](https://huggingface.co/docs/transformers/main/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
|
||||
1. **[Data2Vec](https://huggingface.co/docs/transformers/main/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
|
||||
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
@ -244,11 +246,13 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
|
||||
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/main/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/main/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
||||
1. **[GPT NeoX](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
|
||||
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
|
||||
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
|
||||
@ -256,34 +260,41 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[ImageGPT](https://huggingface.co/docs/transformers/main/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/main/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
|
||||
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
|
||||
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LeViT](https://huggingface.co/docs/transformers/main/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
|
||||
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LongT5](https://huggingface.co/docs/transformers/main/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
|
||||
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
|
||||
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
|
||||
1. **[M-CTC-T](https://huggingface.co/docs/transformers/main/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
|
||||
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
|
||||
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
|
||||
1. **[MaskFormer](https://huggingface.co/docs/transformers/main/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
|
||||
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
|
||||
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
|
||||
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/main/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
|
||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/main/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[PoolFormer](https://huggingface.co/docs/transformers/main/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
|
||||
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
|
||||
1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
|
||||
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
|
||||
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
|
||||
1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
|
||||
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
|
||||
1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
@ -292,24 +303,26 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/main/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
1. **[VAN](https://huggingface.co/docs/transformers/main/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
1. **[ViLT](https://huggingface.co/docs/transformers/main/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
|
||||
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
|
||||
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/main/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
|
||||
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
|
||||
1. **[WavLM](https://huggingface.co/docs/transformers/main/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
|
||||
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
@ -318,7 +331,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
|
||||
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/main/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
|
||||
1. **[YOLOS](https://huggingface.co/docs/transformers/main/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
|
||||
1. 새로운 모델을 올리고 싶나요? 우리가 **상세한 가이드와 템플릿** 으로 새로운 모델을 올리도록 도와드릴게요. 가이드와 템플릿은 이 저장소의 [`templates`](./templates) 폴더에서 확인하실 수 있습니다. [컨트리뷰션 가이드라인](./CONTRIBUTING.md)을 꼭 확인해주시고, PR을 올리기 전에 메인테이너에게 연락하거나 이슈를 오픈해 피드백을 받으시길 바랍니다.
|
||||
|
||||
각 모델이 Flax, PyTorch, TensorFlow으로 구현되었는지 또는 🤗 Tokenizers 라이브러리가 지원하는 토크나이저를 사용하는지 확인하려면, [이 표](https://huggingface.co/docs/transformers/index#supported-frameworks)를 확인하세요.
|
||||
@ -352,4 +366,4 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
|
||||
url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
|
||||
pages = "38--45"
|
||||
}
|
||||
```
|
||||
```
|
||||
@ -227,7 +227,7 @@ conda install -c huggingface transformers
|
||||
|
||||
## 模型架构
|
||||
|
||||
**🤗 Transformers 支持的[所有的模型检查点](https://huggingface.co/models)** 由[用户](https://huggingface.co/users)和[组织](https://huggingface.co/organizations)上传,均与 huggingface.co [model hub](https://huggingface.co) 无缝整合。
|
||||
🤗 Transformers 支持的[**所有的模型检查点**](https://huggingface.co/models)由[用户](https://huggingface.co/users)和[组织](https://huggingface.co/organizations)上传,均与 huggingface.co [model hub](https://huggingface.co) 无缝整合。
|
||||
|
||||
目前的检查点数量: 
|
||||
|
||||
@ -245,6 +245,7 @@ conda install -c huggingface transformers
|
||||
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
|
||||
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
|
||||
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
|
||||
1. **[BLOOM](https://huggingface.co/docs/transformers/main/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/).
|
||||
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。
|
||||
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (来自 Google Research) 伴随论文 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 由 Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 发布。
|
||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。
|
||||
@ -254,6 +255,7 @@ conda install -c huggingface transformers
|
||||
1. **[ConvNeXT](https://huggingface.co/docs/transformers/main/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。
|
||||
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。
|
||||
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。
|
||||
1. **[CvT](https://huggingface.co/docs/transformers/main/model_doc/cvt)** (来自 Microsoft) 伴随论文 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 由 Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 发布。
|
||||
1. **[Data2Vec](https://huggingface.co/docs/transformers/main/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。
|
||||
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。
|
||||
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。
|
||||
@ -268,11 +270,13 @@ conda install -c huggingface transformers
|
||||
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。
|
||||
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
|
||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/main/model_doc/flava)** (来自 Facebook AI) 伴随论文 [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 由 Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela 发布。
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/main/model_doc/glpn)** (来自 KAIST) 伴随论文 [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 由 Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim 发布。
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。
|
||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。
|
||||
1. **[GPT NeoX](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
|
||||
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
|
||||
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。
|
||||
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
|
||||
@ -280,34 +284,41 @@ conda install -c huggingface transformers
|
||||
1. **[ImageGPT](https://huggingface.co/docs/transformers/main/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/main/model_doc/layoutlmv3)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 由 Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei 发布。
|
||||
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) 由 Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei 发布。
|
||||
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
|
||||
1. **[LeViT](https://huggingface.co/docs/transformers/main/model_doc/levit)** (来自 Meta AI) 伴随论文 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 由 Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 发布。
|
||||
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
|
||||
1. **[LongT5](https://huggingface.co/docs/transformers/main/model_doc/longt5)** (来自 Google AI) released 伴随论文 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 由 Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 发布。
|
||||
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。
|
||||
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (来自 UNC Chapel Hill) 伴随论文 [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) 由 Hao Tan and Mohit Bansal 发布。
|
||||
1. **[M-CTC-T](https://huggingface.co/docs/transformers/main/model_doc/mctct)** (来自 Facebook) 伴随论文 [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) 由 Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert 发布。
|
||||
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (来自 Facebook) 伴随论文 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 由 Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 发布。
|
||||
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。
|
||||
1. **[MaskFormer](https://huggingface.co/docs/transformers/main/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov
|
||||
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。
|
||||
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。
|
||||
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。
|
||||
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (来自 Studio Ousia) 伴随论文 [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) 由 Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka 发布。
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (来自 CMU/Google Brain) 伴随论文 [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 由 Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou 发布。
|
||||
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。
|
||||
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/main/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
|
||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。
|
||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/main/model_doc/plbart)** (来自 UCLA NLP) 伴随论文 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 由 Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 发布。
|
||||
1. **[PoolFormer](https://huggingface.co/docs/transformers/main/model_doc/poolformer)** (来自 Sea AI Labs) 伴随论文 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 由 Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 发布。
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (来自 UCLA NLP) 伴随论文 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 由 Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 发布。
|
||||
1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (来自 Sea AI Labs) 伴随论文 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 由 Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 发布。
|
||||
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
|
||||
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。
|
||||
1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (来自 Facebook) 伴随论文 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 由 Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela 发布。
|
||||
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。
|
||||
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。
|
||||
1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
|
||||
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。
|
||||
1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。
|
||||
@ -316,24 +327,26 @@ conda install -c huggingface transformers
|
||||
1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (来自 Facebook), 伴随论文 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 发布。
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。
|
||||
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/main/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。
|
||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
|
||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
|
||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。
|
||||
1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。
|
||||
1. **[VAN](https://huggingface.co/docs/transformers/main/model_doc/van)** (来自 Tsinghua University and Nankai University) 伴随论文 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 由 Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 发布。
|
||||
1. **[ViLT](https://huggingface.co/docs/transformers/main/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。
|
||||
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (来自 Tsinghua University and Nankai University) 伴随论文 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 由 Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 发布。
|
||||
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。
|
||||
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。
|
||||
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/main/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (来自 Meta AI) 伴随论文 [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) 由 Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick 发布。
|
||||
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。
|
||||
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。
|
||||
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。
|
||||
1. **[WavLM](https://huggingface.co/docs/transformers/main/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
|
||||
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。
|
||||
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
|
||||
@ -342,7 +355,8 @@ conda install -c huggingface transformers
|
||||
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。
|
||||
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (来自 Facebook AI) 伴随论文 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 由 Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 发布。
|
||||
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/main/model_doc/yoso)** (来自 the University of Wisconsin - Madison) 伴随论文 [You Only Sample (Almost) 由 Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 发布。
|
||||
1. **[YOLOS](https://huggingface.co/docs/transformers/main/model_doc/yolos)** (来自 Huazhong University of Science & Technology) 伴随论文 [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) 由 Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu 发布。
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (来自 the University of Wisconsin - Madison) 伴随论文 [You Only Sample (Almost) 由 Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh 发布。
|
||||
1. 想要贡献新的模型?我们这里有一份**详细指引和模板**来引导你添加新的模型。你可以在 [`templates`](./templates) 目录中找到他们。记得查看 [贡献指南](./CONTRIBUTING.md) 并在开始写 PR 前联系维护人员或开一个新的 issue 来获得反馈。
|
||||
|
||||
要检查某个模型是否已有 Flax、PyTorch 或 TensorFlow 的实现,或其是否在 🤗 Tokenizers 库中有对应词符化器(tokenizer),敬请参阅[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。
|
||||
@ -377,4 +391,4 @@ conda install -c huggingface transformers
|
||||
url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
|
||||
pages = "38--45"
|
||||
}
|
||||
```
|
||||
```
|
||||
@ -257,6 +257,7 @@ conda install -c huggingface transformers
|
||||
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BLOOM](https://huggingface.co/docs/transformers/main/model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/).
|
||||
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
|
||||
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
|
||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
@ -266,6 +267,7 @@ conda install -c huggingface transformers
|
||||
1. **[ConvNeXT](https://huggingface.co/docs/transformers/main/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
|
||||
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
|
||||
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
1. **[CvT](https://huggingface.co/docs/transformers/main/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
|
||||
1. **[Data2Vec](https://huggingface.co/docs/transformers/main/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
|
||||
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
@ -280,11 +282,13 @@ conda install -c huggingface transformers
|
||||
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
|
||||
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
1. **[FLAVA](https://huggingface.co/docs/transformers/main/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
|
||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[GLPN](https://huggingface.co/docs/transformers/main/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
||||
1. **[GPT NeoX](https://huggingface.co/docs/transformers/main/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
|
||||
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
|
||||
1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
|
||||
@ -292,34 +296,41 @@ conda install -c huggingface transformers
|
||||
1. **[ImageGPT](https://huggingface.co/docs/transformers/main/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
|
||||
1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
|
||||
1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
|
||||
1. **[LayoutLMv3](https://huggingface.co/docs/transformers/main/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
|
||||
1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
|
||||
1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LeViT](https://huggingface.co/docs/transformers/main/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
|
||||
1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LongT5](https://huggingface.co/docs/transformers/main/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
|
||||
1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
|
||||
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
|
||||
1. **[M-CTC-T](https://huggingface.co/docs/transformers/main/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
|
||||
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
|
||||
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
|
||||
1. **[MaskFormer](https://huggingface.co/docs/transformers/main/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov
|
||||
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
|
||||
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
|
||||
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
|
||||
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/main/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
|
||||
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/main/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[PoolFormer](https://huggingface.co/docs/transformers/main/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
|
||||
1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
|
||||
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
|
||||
1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
|
||||
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
|
||||
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
|
||||
1. **[RegNet](https://huggingface.co/docs/transformers/main/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
|
||||
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
|
||||
1. **[ResNet](https://huggingface.co/docs/transformers/main/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
@ -328,24 +339,26 @@ conda install -c huggingface transformers
|
||||
1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
|
||||
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/main/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
1. **[TAPEX](https://huggingface.co/docs/transformers/main/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
|
||||
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/main/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
1. **[VAN](https://huggingface.co/docs/transformers/main/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
1. **[ViLT](https://huggingface.co/docs/transformers/main/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
|
||||
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
|
||||
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/main/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/main/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
|
||||
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
|
||||
1. **[WavLM](https://huggingface.co/docs/transformers/main/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
|
||||
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
@ -354,7 +367,8 @@ conda install -c huggingface transformers
|
||||
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
|
||||
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/main/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
|
||||
1. **[YOLOS](https://huggingface.co/docs/transformers/main/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
|
||||
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
|
||||
1. 想要貢獻新的模型?我們這裡有一份**詳細指引和模板**來引導你加入新的模型。你可以在 [`templates`](./templates) 目錄中找到它們。記得查看[貢獻指引](./CONTRIBUTING.md)並在開始寫 PR 前聯繫維護人員或開一個新的 issue 來獲得 feedbacks。
|
||||
|
||||
要檢查某個模型是否已有 Flax、PyTorch 或 TensorFlow 的實作,或其是否在🤗 Tokenizers 函式庫中有對應的 tokenizer,敬請參閱[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。
|
||||
@ -389,4 +403,4 @@ conda install -c huggingface transformers
|
||||
url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
|
||||
pages = "38--45"
|
||||
}
|
||||
```
|
||||
```
|
||||
@ -3,20 +3,36 @@ LABEL maintainer="Hugging Face"
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
|
||||
# to be used as arguments for docker build (so far).
|
||||
|
||||
ARG PYTORCH='1.11.0'
|
||||
# (not always a valid torch version)
|
||||
ARG INTEL_TORCH_EXT='1.11.0'
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
ARG CUDA='cu113'
|
||||
|
||||
RUN apt update
|
||||
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
|
||||
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
|
||||
RUN git lfs install
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip
|
||||
|
||||
ARG REF=main
|
||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir -U torch tensorflow
|
||||
RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
RUN python3 -m pip install --no-cache-dir -U tensorflow
|
||||
RUN python3 -m pip uninstall -y flax jax
|
||||
RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+cu102.html
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$PYTORCH+$CUDA.html
|
||||
RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
|
||||
RUN python3 -m pip install -U "itsdangerous<2.1.0"
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
# this line must be added in order for python to be aware of transformers.
|
||||
RUN cd transformers && python3 setup.py develop
|
||||
|
||||
@ -15,5 +15,6 @@ RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/fac
|
||||
RUN python3 -m pip install --no-cache-dir pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
|
||||
RUN python3 -m pip install -U "itsdangerous<2.1.0"
|
||||
|
||||
RUN doc-builder build transformers transformers/docs/source --build_dir doc-build-dev --notebook_dir notebooks/transformers_doc --clean --version pr_$PR_NUMBER
|
||||
# Test if the image could successfully build the doc. before publishing the image
|
||||
RUN doc-builder build transformers transformers/docs/source/en --build_dir doc-build-dev --notebook_dir notebooks/transformers_doc --clean
|
||||
RUN rm -rf doc-build-dev
|
||||
@ -9,10 +9,17 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
|
||||
|
||||
ARG REF=main
|
||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[testing,deepspeed]
|
||||
|
||||
RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
|
||||
# Install latest release PyTorch
|
||||
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
|
||||
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
|
||||
RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
|
||||
|
||||
# Pre-build DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
|
||||
RUN python3 -m pip uninstall -y deepspeed
|
||||
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
# this line must be added in order for python to be aware of transformers.
|
||||
|
||||
@ -13,11 +13,16 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing]
|
||||
|
||||
# If set to nothing, will install the latest version
|
||||
ARG PYTORCH=''
|
||||
ARG TORCH_VISION=''
|
||||
ARG TORCH_AUDIO=''
|
||||
|
||||
RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/cu113
|
||||
RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='TORCH_VISION'.*' || VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/cu113
|
||||
RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' || VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/cu113
|
||||
|
||||
RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION
|
||||
RUN python3 -m pip uninstall -y tensorflow flax
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+cu102.html
|
||||
RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+cu113.html
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
|
||||
RUN python3 -m pip install -U "itsdangerous<2.1.0"
|
||||
|
||||
|
||||
@ -108,6 +108,11 @@ Make sure to put your new file under the proper section. It's unlikely to go in
|
||||
depending on the intended targets (beginners, more advanced users or researchers) it should go in section two, three or
|
||||
four.
|
||||
|
||||
### Translating
|
||||
|
||||
When translating, refer to the guide at [./TRANSLATING.md](https://github.com/huggingface/transformers/blob/main/docs/TRANSLATING.md).
|
||||
|
||||
|
||||
### Adding a new model
|
||||
|
||||
When adding a new model:
|
||||
@ -402,4 +407,4 @@ Here are a few tips to help you debug the doctests and make them pass:
|
||||
* whitespace: one give whitespace (space, tabulation, new line) is equivalent to any number of whitespace, so you can add new lines where there are spaces to make your output more readable.
|
||||
* numerical values: you should never put more than 4 or 5 digits to expected results as different setups or library versions might get you slightly different results. `doctest` is configure to ignore any difference lower than the precision to which you wrote (so 1e-4 if you write 4 digits).
|
||||
- Don't leave a block of code that is very long to execute. If you can't make it fast, you can either not use the doctest syntax on it (so that it's ignored), or if you want to use the doctest syntax to show the results, you can add a comment `# doctest: +SKIP` at the end of the lines of code too long to execute
|
||||
- Each line of code that produces a result needs to have that result written below. You can ignore an output if you don't want to show it in your code example by adding a comment ` # doctest: +IGNORE_RESULT` at the end of the line of code produing it.
|
||||
- Each line of code that produces a result needs to have that result written below. You can ignore an output if you don't want to show it in your code example by adding a comment ` # doctest: +IGNORE_RESULT` at the end of the line of code producing it.
|
||||
|
||||
58
docs/TRANSLATING.md
Normal file
58
docs/TRANSLATING.md
Normal file
@ -0,0 +1,58 @@
|
||||
### Translating the Transformers documentation into your language
|
||||
|
||||
As part of our mission to democratize machine learning, we'd love to make the Transformers library available in many more languages! Follow the steps below if you want to help translate the documentation into your language 🙏.
|
||||
|
||||
**🗞️ Open an issue**
|
||||
|
||||
To get started, navigate to the [Issues](https://github.com/huggingface/transformers/issues) page of this repo and check if anyone else has opened an issue for your language. If not, open a new issue by selecting the "Translation template" from the "New issue" button.
|
||||
|
||||
Once an issue exists, post a comment to indicate which chapters you'd like to work on, and we'll add your name to the list.
|
||||
|
||||
|
||||
**🍴 Fork the repository**
|
||||
|
||||
First, you'll need to [fork the Transformers repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page.
|
||||
|
||||
Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/YOUR-USERNAME/transformers.git
|
||||
```
|
||||
|
||||
**📋 Copy-paste the English version with a new language code**
|
||||
|
||||
The documentation files are in one leading directory:
|
||||
|
||||
- [`docs/source`](https://github.com/huggingface/transformers/tree/main/docs/source): All the documentation materials are organized here by language.
|
||||
|
||||
You'll only need to copy the files in the [`docs/source/en`](https://github.com/huggingface/transformers/tree/main/docs/source/en) directory, so first navigate to your fork of the repo and run the following:
|
||||
|
||||
```bash
|
||||
cd ~/path/to/transformers/docs
|
||||
cp -r source/en source/LANG-ID
|
||||
```
|
||||
|
||||
Here, `LANG-ID` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table.
|
||||
|
||||
**✍️ Start translating**
|
||||
|
||||
The fun part comes - translating the text!
|
||||
|
||||
The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your doc chapter. This file is used to render the table of contents on the website.
|
||||
|
||||
> 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can create one by copy-pasting from the English version and deleting the sections unrelated to your chapter. Just make sure it exists in the `docs/source/LANG-ID/` directory!
|
||||
|
||||
The fields you should add are `local` (with the name of the file containing the translation; e.g. `autoclass_tutorial`), and `title` (with the title of the doc in your language; e.g. `Load pretrained instances with an AutoClass`) -- as a reference, here is the `_toctree.yml` for [English](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml):
|
||||
|
||||
```yaml
|
||||
- sections:
|
||||
- local: pipeline_tutorial # Do not change this! Use the same name for your .md file
|
||||
title: Pipelines for inference # Translate this!
|
||||
...
|
||||
title: Tutorials # Translate this!
|
||||
```
|
||||
|
||||
Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter.
|
||||
|
||||
> 🙋 If you'd like others to help you with the translation, you can either [open an issue](https://github.com/huggingface/transformers/issues) or tag @[espejelomar](https://twitter.com/espejelomar)
|
||||
on Twitter to gain some visibility.
|
||||
@ -1,4 +1,4 @@
|
||||
- sections:
|
||||
- sections:
|
||||
- local: index
|
||||
title: 🤗 Transformers
|
||||
- local: quicktour
|
||||
@ -60,9 +60,9 @@
|
||||
- local: serialization
|
||||
title: Export 🤗 Transformers models
|
||||
- local: performance
|
||||
title: 'Performance and Scalability: How To Fit a Bigger Model and Train It Faster'
|
||||
- local: parallelism
|
||||
title: Model Parallelism
|
||||
title: Performance and scalability
|
||||
- local: big_models
|
||||
title: Instantiating a big model
|
||||
- local: benchmarks
|
||||
title: Benchmarks
|
||||
- local: migration
|
||||
@ -81,6 +81,16 @@
|
||||
title: "How to add a model to 🤗 Transformers?"
|
||||
- local: add_new_pipeline
|
||||
title: "How to add a pipeline to 🤗 Transformers?"
|
||||
- local: perf_train_gpu_one
|
||||
title: Training on one GPU
|
||||
- local: perf_train_gpu_many
|
||||
title: Training on many GPUs
|
||||
- local: perf_train_cpu
|
||||
title: Training on CPU
|
||||
- local: perf_infer_cpu
|
||||
title: Inference on CPU
|
||||
- local: perf_hardware
|
||||
title: Custom hardware for training
|
||||
- local: testing
|
||||
title: Testing
|
||||
- local: pr_checks
|
||||
@ -168,6 +178,8 @@
|
||||
title: Blenderbot
|
||||
- local: model_doc/blenderbot-small
|
||||
title: Blenderbot Small
|
||||
- local: model_doc/bloom
|
||||
title: BLOOM
|
||||
- local: model_doc/bort
|
||||
title: BORT
|
||||
- local: model_doc/byt5
|
||||
@ -186,6 +198,8 @@
|
||||
title: CPM
|
||||
- local: model_doc/ctrl
|
||||
title: CTRL
|
||||
- local: model_doc/cvt
|
||||
title: CvT
|
||||
- local: model_doc/data2vec
|
||||
title: Data2Vec
|
||||
- local: model_doc/deberta
|
||||
@ -214,6 +228,8 @@
|
||||
title: Encoder Decoder Models
|
||||
- local: model_doc/flaubert
|
||||
title: FlauBERT
|
||||
- local: model_doc/flava
|
||||
title: FLAVA
|
||||
- local: model_doc/fnet
|
||||
title: FNet
|
||||
- local: model_doc/fsmt
|
||||
@ -232,12 +248,18 @@
|
||||
title: LayoutLM
|
||||
- local: model_doc/layoutlmv2
|
||||
title: LayoutLMV2
|
||||
- local: model_doc/layoutlmv3
|
||||
title: LayoutLMV3
|
||||
- local: model_doc/layoutxlm
|
||||
title: LayoutXLM
|
||||
- local: model_doc/led
|
||||
title: LED
|
||||
- local: model_doc/levit
|
||||
title: LeViT
|
||||
- local: model_doc/longformer
|
||||
title: Longformer
|
||||
- local: model_doc/longt5
|
||||
title: LongT5
|
||||
- local: model_doc/luke
|
||||
title: LUKE
|
||||
- local: model_doc/lxmert
|
||||
@ -250,6 +272,8 @@
|
||||
title: M2M100
|
||||
- local: model_doc/mbart
|
||||
title: MBart and MBart-50
|
||||
- local: model_doc/mctct
|
||||
title: MCTCT
|
||||
- local: model_doc/megatron-bert
|
||||
title: MegatronBERT
|
||||
- local: model_doc/megatron_gpt2
|
||||
@ -266,12 +290,16 @@
|
||||
title: Nyströmformer
|
||||
- local: model_doc/openai-gpt
|
||||
title: OpenAI GPT
|
||||
- local: model_doc/opt
|
||||
title: OPT
|
||||
- local: model_doc/gpt2
|
||||
title: OpenAI GPT2
|
||||
- local: model_doc/gptj
|
||||
title: GPT-J
|
||||
- local: model_doc/gpt_neo
|
||||
title: GPT Neo
|
||||
- local: model_doc/gpt_neox
|
||||
title: GPT NeoX
|
||||
- local: model_doc/hubert
|
||||
title: Hubert
|
||||
- local: model_doc/perceiver
|
||||
@ -332,6 +360,8 @@
|
||||
title: TAPAS
|
||||
- local: model_doc/tapex
|
||||
title: TAPEX
|
||||
- local: model_doc/trajectory_transformer
|
||||
title: Trajectory Transformer
|
||||
- local: model_doc/transfo-xl
|
||||
title: Transformer XL
|
||||
- local: model_doc/trocr
|
||||
@ -356,6 +386,8 @@
|
||||
title: VisualBERT
|
||||
- local: model_doc/wav2vec2
|
||||
title: Wav2Vec2
|
||||
- local: model_doc/wav2vec2-conformer
|
||||
title: Wav2Vec2-Conformer
|
||||
- local: model_doc/wav2vec2_phoneme
|
||||
title: Wav2Vec2Phoneme
|
||||
- local: model_doc/wavlm
|
||||
@ -376,6 +408,8 @@
|
||||
title: XLSR-Wav2Vec2
|
||||
- local: model_doc/xls_r
|
||||
title: XLS-R
|
||||
- local: model_doc/yolos
|
||||
title: YOLOS
|
||||
- local: model_doc/yoso
|
||||
title: YOSO
|
||||
title: Models
|
||||
|
||||
128
docs/source/en/big_models.mdx
Normal file
128
docs/source/en/big_models.mdx
Normal file
@ -0,0 +1,128 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Instantiating a big model
|
||||
|
||||
When you want to use a very big pretrained model, one challenge is to minimize the use of the RAM. The usual workflow
|
||||
from PyTorch is:
|
||||
|
||||
1. Create your model with random weights.
|
||||
2. Load your pretrained weights.
|
||||
3. Put those pretrained weights in your random model.
|
||||
|
||||
Step 1 and 2 both require a full version of the model in memory, which is not a problem in most cases, but if your model starts weighing several GigaBytes, those two copies can make you got our of RAM. Even worse, if you are using `torch.distributed` to launch a distributed training, each process will load the pretrained model and store these two copies in RAM.
|
||||
|
||||
<Tip>
|
||||
|
||||
Note that the randomly created model is initialized with "empty" tensors, which take the space in memory without filling it (thus the random values are whatever was in this chunk of memory at a given time). The random initialization following the appropriate distribution for the kind of model/parameters instatiated (like a normal distribution for instance) is only performed after step 3 on the non-initialized weights, to be as fast as possible!
|
||||
|
||||
</Tip>
|
||||
|
||||
In this guide, we explore the solutions Transformers offer to deal with this issue. Note that this is an area of active development, so the APIs explained here may change slightly in the future.
|
||||
|
||||
## Sharded checkpoints
|
||||
|
||||
Since version 4.18.0, model checkpoints that end up taking more than 10GB of space are automatically sharded in smaller pieces. In terms of having one single checkpoint when you do `model.save_pretrained(save_dir)`, you will end up with several partial checkpoints (each of which being of size < 10GB) and an index that maps parameter names to the files they are stored in.
|
||||
|
||||
You can control the maximum size before sharding with the `max_shard_size` parameter, so for the sake of an example, we'll use a normal-size models with a small shard size: let's take a traditional BERT model.
|
||||
|
||||
```py
|
||||
from transformers import AutoModel
|
||||
|
||||
model = AutoModel.from_pretrained("bert-base-cased")
|
||||
```
|
||||
|
||||
If you save it using [`~PreTrainedModel.save_pretrained`], you will get a new folder with two files: the config of the model and its weights:
|
||||
|
||||
```py
|
||||
>>> import os
|
||||
>>> import tempfile
|
||||
|
||||
>>> with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
... model.save_pretrained(tmp_dir)
|
||||
... print(sorted(os.listdir(tmp_dir)))
|
||||
['config.json', 'pytorch_model.bin']
|
||||
```
|
||||
|
||||
Now let's use a maximum shard size of 200MB:
|
||||
|
||||
```py
|
||||
>>> with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
... model.save_pretrained(tmp_dir, max_shard_size="200MB")
|
||||
... print(sorted(os.listdir(tmp_dir)))
|
||||
['config.json', 'pytorch_model-00001-of-00003.bin', 'pytorch_model-00002-of-00003.bin', 'pytorch_model-00003-of-00003.bin', 'pytorch_model.bin.index.json']
|
||||
```
|
||||
|
||||
On top of the configuration of the model, we see three different weights files, and an `index.json` file which is our index. A checkpoint like this can be fully reloaded using the [`~PreTrainedModel.from_pretrained`] method:
|
||||
|
||||
```py
|
||||
>>> with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
... model.save_pretrained(tmp_dir, max_shard_size="200MB")
|
||||
... new_model = AutoModel.from_pretrained(tmp_dir)
|
||||
```
|
||||
|
||||
The main advantage of doing this for big models is that during step 2 of the workflow shown above, each shard of the checkpoint is loaded after the previous one, capping the memory usage in RAM to the model size plus the size of the biggest shard.
|
||||
|
||||
Beind the scenes, the index file is used to determine which keys are in the checkpoint, and where the corresponding weights are stored. We can load that index like any json and get a dictionary:
|
||||
|
||||
```py
|
||||
>>> import json
|
||||
|
||||
>>> with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
... model.save_pretrained(tmp_dir, max_shard_size="200MB")
|
||||
... with open(os.path.join(tmp_dir, "pytorch_model.bin.index.json"), "r") as f:
|
||||
... index = json.load(f)
|
||||
|
||||
>>> print(index.keys())
|
||||
dict_keys(['metadata', 'weight_map'])
|
||||
```
|
||||
|
||||
The metadata just consists of the total size of the model for now. We plan to add several other informations in the future:
|
||||
|
||||
```py
|
||||
>>> index["metadata"]
|
||||
{'total_size': 433245184}
|
||||
```
|
||||
|
||||
The weights map is the main part of this index, which maps each parameter name (as usually found in a PyTorch model `state_dict`) to the file it's stored in:
|
||||
|
||||
```py
|
||||
>>> index["weight_map"]
|
||||
{'embeddings.LayerNorm.bias': 'pytorch_model-00001-of-00003.bin',
|
||||
'embeddings.LayerNorm.weight': 'pytorch_model-00001-of-00003.bin',
|
||||
...
|
||||
```
|
||||
|
||||
If you want to directly load such a sharded checkpoint inside a model without using [`~PreTrainedModel.from_pretrained`] (like you would do `model.load_state_dict()` for a full checkpoint) you should use [`~modeling_utils.load_sharded_checkpoint`]:
|
||||
|
||||
```py
|
||||
>>> from transformers.modeling_utils import load_sharded_checkpoint
|
||||
|
||||
>>> with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
... model.save_pretrained(tmp_dir, max_shard_size="200MB")
|
||||
... load_sharded_checkpoint(model, tmp_dir)
|
||||
```
|
||||
|
||||
## Low memory loading
|
||||
|
||||
Sharded checkpoints reduce the memory usage during step 2 of the worflow mentioned above, but when loadin a pretrained model, why keep the random weights in memory? The option `low_cpu_mem_usage` will destroy the weights of the randomly initialized model, then progressively load the weights inside, then perform a random initialization for potential missing weights (if you are loadding a model with a newly initialized head for a fine-tuning task for instance).
|
||||
|
||||
It's very easy to use, just add `low_cpu_mem_usage=True` to your call to [`~PreTrainedModel.from_pretrained`]:
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForSequenceClas
|
||||
|
||||
model = AutoModel.from_pretrained("bert-base-cased", low_cpu_mem_usage=True)
|
||||
```
|
||||
|
||||
This can be used in conjunction with a sharded checkpoint.
|
||||
|
||||
@ -106,7 +106,7 @@ directly upload your config to the Hub.
|
||||
|
||||
Now that we have our ResNet configuration, we can go on writing the model. We will actually write two: one that
|
||||
extracts the hidden features from a batch of images (like [`BertModel`]) and one that is suitable for image
|
||||
classification (like [`BertModelForSequenceClassification`]).
|
||||
classification (like [`BertForSequenceClassification`]).
|
||||
|
||||
As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. The only
|
||||
thing we need to do before writing this class is a map between the block types and actual block classes. Then the
|
||||
|
||||
@ -28,7 +28,7 @@ Each 🤗 Transformers architecture is defined in a standalone Python module so
|
||||
## If you are looking for custom support from the Hugging Face team
|
||||
|
||||
<a target="_blank" href="https://huggingface.co/support">
|
||||
<img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
|
||||
<img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
|
||||
</a><br>
|
||||
|
||||
## Contents
|
||||
@ -57,63 +57,73 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
|
||||
1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
|
||||
1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
|
||||
1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||
1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
|
||||
1. **[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
|
||||
1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BLOOM](model_doc/bloom)** (from BigScience workshop) released by the [BigSicence Workshop](https://bigscience.huggingface.co/).
|
||||
1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
|
||||
1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
|
||||
1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
||||
1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
|
||||
1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
||||
1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
|
||||
1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
|
||||
1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
|
||||
1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
1. **[CvT](model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
|
||||
1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
|
||||
1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[Decision Transformer](model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
|
||||
1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
|
||||
1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
|
||||
1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
|
||||
1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
|
||||
1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
|
||||
1. **[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
|
||||
1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
|
||||
1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
|
||||
1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
|
||||
1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
|
||||
1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[GLPN](model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
||||
1. **[GPT NeoX](model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
|
||||
1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||
1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
|
||||
1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
||||
1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
|
||||
1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
|
||||
1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
|
||||
1. **[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
|
||||
1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
|
||||
1. **[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
|
||||
1. **[LayoutXLM](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
|
||||
1. **[LED](model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LeViT](model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
|
||||
1. **[Longformer](model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LongT5](model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
|
||||
1. **[LUKE](model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
|
||||
1. **[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
|
||||
1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
|
||||
1. **[M-CTC-T](model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
|
||||
1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
|
||||
1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
|
||||
1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
|
||||
1. **[MBart](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[MBart-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[mBART](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[mBART-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
|
||||
1. **[mLUKE](model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
|
||||
1. **[MobileBERT](model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
|
||||
1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
|
||||
1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
|
||||
1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
|
||||
1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
|
||||
1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
|
||||
1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
|
||||
@ -121,10 +131,11 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
|
||||
1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
|
||||
1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
|
||||
1. **[RAG](model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
|
||||
1. **[REALM](model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
|
||||
1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
|
||||
1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
|
||||
1. **[RegNet](model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
|
||||
1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
|
||||
1. **[ResNet](model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
|
||||
@ -134,12 +145,13 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
|
||||
1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
|
||||
1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBert](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[SqueezeBERT](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
|
||||
1. **[TAPEX](model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
|
||||
1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
|
||||
1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
@ -147,19 +159,21 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
|
||||
1. **[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
|
||||
1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
|
||||
1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
|
||||
1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
|
||||
1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
|
||||
1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
|
||||
1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
|
||||
1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
|
||||
1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
|
||||
1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[XLS-R](model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
|
||||
1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[YOLOS](model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
|
||||
1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
|
||||
|
||||
|
||||
@ -179,19 +193,22 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| BERT | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| BigBird-Pegasus | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| BLOOM | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| Canine | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| CANINE | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| ConvNeXT | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| CvT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Data2VecVision | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| DeBERTa-v2 | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| DETR | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
@ -202,31 +219,38 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| FNet | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| LayoutLMv3 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| LED | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| M-CTC-T | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Marian | ✅ | ❌ | ✅ | ✅ | ✅ |
|
||||
| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| mBART | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Megatron-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| MT5 | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Nyströmformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| OPT | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
@ -234,7 +258,7 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| RAG | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Realm | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| REALM | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| RegNet | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
@ -250,10 +274,10 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| Swin | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Swin Transformer | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| TAPEX | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
@ -262,17 +286,19 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| VisualBERT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| ViT | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ |
|
||||
| Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
|
||||
<!-- End table-->
|
||||
|
||||
@ -127,6 +127,9 @@ generation.
|
||||
[[autodoc]] TopKLogitsWarper
|
||||
- __call__
|
||||
|
||||
[[autodoc]] TypicalLogitsWarper
|
||||
- __call__
|
||||
|
||||
[[autodoc]] NoRepeatNGramLogitsProcessor
|
||||
- __call__
|
||||
|
||||
|
||||
@ -22,6 +22,8 @@ Most of those are only useful if you are studying the code of the Trainer in the
|
||||
|
||||
[[autodoc]] IntervalStrategy
|
||||
|
||||
[[autodoc]] enable_full_determinism
|
||||
|
||||
[[autodoc]] set_seed
|
||||
|
||||
[[autodoc]] torch_distributed_zero_first
|
||||
|
||||
@ -40,29 +40,17 @@ Additionally, some `warnings` can be disabled by setting the environment variabl
|
||||
TRANSFORMERS_NO_ADVISORY_WARNINGS=1 ./myprogram.py
|
||||
```
|
||||
|
||||
Here is an example of how to use `logging` in a module:
|
||||
Here is an example of how to use the same logger as the library in your own module or script:
|
||||
|
||||
```python
|
||||
from transformers.utils import logging
|
||||
|
||||
logging.set_verbosity_info()
|
||||
logger = logging.get_logger(__name__)
|
||||
logger = logging.get_logger("transformers")
|
||||
logger.info("INFO")
|
||||
logger.warning("WARN")
|
||||
```
|
||||
|
||||
Above, a `logger` instance is created from `logging.get_logger(__name__)`. If you want to use `logging` in a script, you shouldn't pass `__name__` to `logging.get_logger`. For example:
|
||||
|
||||
```python
|
||||
from transformers.utils import logging
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.set_verbosity_info()
|
||||
# leave it empy or use a string
|
||||
logger = logging.get_logger()
|
||||
logger.info("INFO")
|
||||
logger.warning("WARN")
|
||||
```
|
||||
|
||||
All the methods of this logging module are documented below, the main ones are
|
||||
[`logging.get_verbosity`] to get the current level of verbosity in the logger and
|
||||
|
||||
@ -38,6 +38,75 @@ for text generation, [`~generation_utils.GenerationMixin`] (for the PyTorch mode
|
||||
|
||||
<a id='from_pretrained-torch-dtype'></a>
|
||||
|
||||
### Large model loading
|
||||
|
||||
In Transformers 4.20.0, the [`~PreTrainedModel.from_pretrained`] method has been reworked to accommodate large models using [Accelerate](https://huggingface.co/docs/accelerate/big_modeling). This requires Accelerate >= 0.9.0 and PyTorch >= 1.9.0. Instead of creating the full model, then loading the pretrained weights inside it (which takes twice the size of the model in RAM, one for the randomly initialized model, one for the weights), there is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded.
|
||||
|
||||
This option can be activated with `low_cpu_mem_usage=True`. The model is first created on the Meta device (with empty weights) and the state dict is then loaded inside it (shard by shard in the case of a sharded checkpoint). This way the maximum RAM used is the full size of the model only.
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", low_cpu_mem_usage=True)
|
||||
```
|
||||
|
||||
Moreover, you can directly place the model on different devices if it doesn't fully fit in RAM (only works for inference for now). With `device_map="auto"`, Accelerate will determine where to put each layer to maximize the use of your fastest devices (GPUs) and offload the rest on the CPU, or even the hard drive if you don't have enough GPU RAM (or CPU RAM). Even if the model is split across several devices, it will run as you would normally expect.
|
||||
|
||||
When passing a `device_map`, `low_cpu_mem_usage` is automatically set to `True`, so you don't need to specify it:
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", device_map="auto")
|
||||
```
|
||||
|
||||
You can inspect how the model was split across devices by looking at its `hf_device_map` attribute:
|
||||
|
||||
```py
|
||||
t0pp.hf_device_map
|
||||
```
|
||||
|
||||
```python out
|
||||
{'shared': 0,
|
||||
'decoder.embed_tokens': 0,
|
||||
'encoder': 0,
|
||||
'decoder.block.0': 0,
|
||||
'decoder.block.1': 1,
|
||||
'decoder.block.2': 1,
|
||||
'decoder.block.3': 1,
|
||||
'decoder.block.4': 1,
|
||||
'decoder.block.5': 1,
|
||||
'decoder.block.6': 1,
|
||||
'decoder.block.7': 1,
|
||||
'decoder.block.8': 1,
|
||||
'decoder.block.9': 1,
|
||||
'decoder.block.10': 1,
|
||||
'decoder.block.11': 1,
|
||||
'decoder.block.12': 1,
|
||||
'decoder.block.13': 1,
|
||||
'decoder.block.14': 1,
|
||||
'decoder.block.15': 1,
|
||||
'decoder.block.16': 1,
|
||||
'decoder.block.17': 1,
|
||||
'decoder.block.18': 1,
|
||||
'decoder.block.19': 1,
|
||||
'decoder.block.20': 1,
|
||||
'decoder.block.21': 1,
|
||||
'decoder.block.22': 'cpu',
|
||||
'decoder.block.23': 'cpu',
|
||||
'decoder.final_layer_norm': 'cpu',
|
||||
'decoder.dropout': 'cpu',
|
||||
'lm_head': 'cpu'}
|
||||
```
|
||||
|
||||
You can also write your own device map following the same format (a dictionary layer name to device). It should map all parameters of the model to a given device, but you don't have to detail where all the submosules of one layer go if that layer is entirely on the same device. For instance, the following device map would work properly for T0pp (as long as you have the GPU memory):
|
||||
|
||||
```python
|
||||
device_map = {"shared": 0, "encoder": 0, "decoder": 1, "lm_head": 1}
|
||||
```
|
||||
|
||||
Another way to minimize the memory impact of your model is to instantiate it at a lower precision dtype (like `torch.float16`).
|
||||
|
||||
### Model Instantiation dtype
|
||||
|
||||
Under Pytorch a model normally gets instantiated with `torch.float32` format. This can be an issue if one tries to
|
||||
@ -89,3 +158,7 @@ Due to Pytorch design, this functionality is only available for floating dtypes.
|
||||
## Pushing to the Hub
|
||||
|
||||
[[autodoc]] utils.PushToHubMixin
|
||||
|
||||
## Sharded checkpoints
|
||||
|
||||
[[autodoc]] modeling_utils.load_sharded_checkpoint
|
||||
|
||||
@ -136,6 +136,30 @@ documented on their corresponding model page.
|
||||
|
||||
[[autodoc]] modeling_outputs.Seq2SeqQuestionAnsweringModelOutput
|
||||
|
||||
## SemanticSegmenterOutput
|
||||
|
||||
[[autodoc]] modeling_outputs.SemanticSegmenterOutput
|
||||
|
||||
## ImageClassifierOutput
|
||||
|
||||
[[autodoc]] modeling_outputs.ImageClassifierOutput
|
||||
|
||||
## ImageClassifierOutputWithNoAttention
|
||||
|
||||
[[autodoc]] modeling_outputs.ImageClassifierOutputWithNoAttention
|
||||
|
||||
## DepthEstimatorOutput
|
||||
|
||||
[[autodoc]] modeling_outputs.DepthEstimatorOutput
|
||||
|
||||
## Wav2Vec2BaseModelOutput
|
||||
|
||||
[[autodoc]] modeling_outputs.Wav2Vec2BaseModelOutput
|
||||
|
||||
## XVectorOutput
|
||||
|
||||
[[autodoc]] modeling_outputs.XVectorOutput
|
||||
|
||||
## TFBaseModelOutput
|
||||
|
||||
[[autodoc]] modeling_tf_outputs.TFBaseModelOutput
|
||||
|
||||
@ -38,6 +38,7 @@ There are two categories of pipeline abstractions to be aware about:
|
||||
- [`Text2TextGenerationPipeline`]
|
||||
- [`TokenClassificationPipeline`]
|
||||
- [`TranslationPipeline`]
|
||||
- [`VisualQuestionAnsweringPipeline`]
|
||||
- [`ZeroShotClassificationPipeline`]
|
||||
- [`ZeroShotImageClassificationPipeline`]
|
||||
|
||||
@ -423,6 +424,12 @@ See [`TokenClassificationPipeline`] for all details.
|
||||
- __call__
|
||||
- all
|
||||
|
||||
### VisualQuestionAnsweringPipeline
|
||||
|
||||
[[autodoc]] VisualQuestionAnsweringPipeline
|
||||
- __call__
|
||||
- all
|
||||
|
||||
### ZeroShotClassificationPipeline
|
||||
|
||||
[[autodoc]] ZeroShotClassificationPipeline
|
||||
|
||||
@ -18,9 +18,7 @@ Rust library [🤗 Tokenizers](https://github.com/huggingface/tokenizers). The "
|
||||
|
||||
1. a significant speed-up in particular when doing batched tokenization and
|
||||
2. additional methods to map between the original string (character and words) and the token space (e.g. getting the
|
||||
index of the token comprising a given character or the span of characters corresponding to a given token). Currently
|
||||
no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT, XLM-RoBERTa
|
||||
and XLNet models).
|
||||
index of the token comprising a given character or the span of characters corresponding to a given token).
|
||||
|
||||
The base classes [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`]
|
||||
implement the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and
|
||||
|
||||
@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Trainer
|
||||
|
||||
The [`Trainer`] class provides an API for feature-complete training in PyTorch for most standard use cases. It's used in most of the [example scripts](../examples).
|
||||
The [`Trainer`] class provides an API for feature-complete training in PyTorch for most standard use cases. It's used in most of the [example scripts](https://github.com/huggingface/transformers/tree/main/examples).
|
||||
|
||||
Before instantiating your [`Trainer`], create a [`TrainingArguments`] to access all the points of customization during training.
|
||||
|
||||
@ -291,10 +291,10 @@ Also if you do set this environment variable it's the best to set it in your `~/
|
||||
The [`Trainer`] has been extended to support libraries that may dramatically improve your training
|
||||
time and fit much bigger models.
|
||||
|
||||
Currently it supports third party solutions, [DeepSpeed](https://github.com/microsoft/DeepSpeed) and [FairScale](https://github.com/facebookresearch/fairscale/), which implement parts of the paper [ZeRO: Memory Optimizations
|
||||
Currently it supports third party solutions, [DeepSpeed](https://github.com/microsoft/DeepSpeed), [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html) and [FairScale](https://github.com/facebookresearch/fairscale/), which implement parts of the paper [ZeRO: Memory Optimizations
|
||||
Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He](https://arxiv.org/abs/1910.02054).
|
||||
|
||||
This provided support is new and experimental as of this writing.
|
||||
This provided support is new and experimental as of this writing. While the support for DeepSpeed and PyTorch FSDP is active and we welcome issues around it, we don't support the FairScale integration anymore since it has been integrated in PyTorch main (see the [PyTorch FSDP integration](#pytorch-fully-sharded-data-parallel))
|
||||
|
||||
<a id='zero-install-notes'></a>
|
||||
|
||||
@ -408,6 +408,12 @@ As always make sure to edit the paths in the example to match your situation.
|
||||
|
||||
### FairScale
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This integration is not supported anymore, we recommend you either use DeepSpeed or PyTorch FSDP.
|
||||
|
||||
</Tip>
|
||||
|
||||
By integrating [FairScale](https://github.com/facebookresearch/fairscale/) the [`Trainer`]
|
||||
provides support for the following features from [the ZeRO paper](https://arxiv.org/abs/1910.02054):
|
||||
|
||||
@ -540,6 +546,42 @@ Known caveats:
|
||||
`FullyShardedDataParallelism` of fairscale. It should be used with the option `auto_wrap` if you are not
|
||||
doing this yourself: `--sharded_ddp "zero_dp_3 auto_wrap"`.
|
||||
|
||||
### PyTorch Fully Sharded Data parallel
|
||||
|
||||
To accelerate training huge models on larger batch sizes, we can use a fully sharded data parallel model.
|
||||
This type of data parallel paradigm enables fitting more data and larger models by sharding the optimizer states, gradients and parameters.
|
||||
To read more about it and the benefits, check out the [Fully Sharded Data Parallel blog](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/).
|
||||
We have integrated the latest PyTorch's Fully Sharded Data Parallel (FSDP) training feature.
|
||||
All you need to do is enable it through the config.
|
||||
|
||||
**Required PyTorch version for FSDP support**: PyTorch Nightly (or 1.12.0 if you read this after it has been released)
|
||||
as the model saving with FSDP activated is only available with recent fixes.
|
||||
|
||||
**Usage**:
|
||||
|
||||
- Make sure you have added the distributed launcher
|
||||
`-m torch.distributed.launch --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE` if you haven't been using it already.
|
||||
|
||||
- **Sharding Strategy**:
|
||||
- FULL_SHARD : Shards optimizer states + gradients + model parameters across data parallel workers/GPUs.
|
||||
For this, add `--fsdp full_shard` to the command line arguments.
|
||||
- SHARD_GRAD_OP : Shards optimizer states + gradients across data parallel workers/GPUs.
|
||||
For this, add `--fsdp shard_grad_op` to the command line arguments.
|
||||
- To offload the parameters and gradients to the CPU,
|
||||
add `--fsdp "full_shard offload"` or `--fsdp "shard_grad_op offload"` to the command line arguments.
|
||||
- To automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`,
|
||||
add `--fsdp "full_shard auto_wrap"` or `--fsdp "shard_grad_op auto_wrap"` to the command line arguments.
|
||||
- To enable both CPU offloading and auto wrapping,
|
||||
add `--fsdp "full_shard offload auto_wrap"` or `--fsdp "shard_grad_op offload auto_wrap"` to the command line arguments.
|
||||
- If auto wrapping is enabled, please add `--fsdp_min_num_params <number>` to command line arguments.
|
||||
It specifies FSDP's minimum number of parameters for Default Auto Wrapping.
|
||||
|
||||
**Few caveats to be aware of**
|
||||
- Mixed precision is currently not supported with FSDP as we wait for PyTorch to fix support for it.
|
||||
More details in this [issues](https://github.com/pytorch/pytorch/issues/75676).
|
||||
- FSDP currently doesn't support multiple parameter groups.
|
||||
More details mentioned in this [issue](https://github.com/pytorch/pytorch/issues/76501)
|
||||
(`The original model parameters' .grads are not set, meaning that they cannot be optimized separately (which is why we cannot support multiple parameter groups)`).
|
||||
|
||||
Sections that were moved:
|
||||
|
||||
|
||||
@ -122,6 +122,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
|
||||
|
||||
[[autodoc]] AutoModelForVision2Seq
|
||||
|
||||
## AutoModelForVisualQuestionAnswering
|
||||
|
||||
[[autodoc]] AutoModelForVisualQuestionAnswering
|
||||
|
||||
## AutoModelForAudioClassification
|
||||
|
||||
[[autodoc]] AutoModelForAudioClassification
|
||||
@ -194,6 +198,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
|
||||
|
||||
[[autodoc]] TFAutoModelForMultipleChoice
|
||||
|
||||
## TFAutoModelForNextSentencePrediction
|
||||
|
||||
[[autodoc]] TFAutoModelForNextSentencePrediction
|
||||
|
||||
## TFAutoModelForTableQuestionAnswering
|
||||
|
||||
[[autodoc]] TFAutoModelForTableQuestionAnswering
|
||||
|
||||
@ -49,7 +49,7 @@ Usage:
|
||||
|
||||
>>> input_ids = tokenizer(
|
||||
... "This is a long article to summarize", add_special_tokens=False, return_tensors="pt"
|
||||
>>> ).input_ids
|
||||
... ).input_ids
|
||||
>>> labels = tokenizer("This is a short summary", return_tensors="pt").input_ids
|
||||
|
||||
>>> # train...
|
||||
@ -67,7 +67,7 @@ Usage:
|
||||
|
||||
>>> input_ids = tokenizer(
|
||||
... "This is the first sentence. This is the second sentence.", add_special_tokens=False, return_tensors="pt"
|
||||
>>> ).input_ids
|
||||
... ).input_ids
|
||||
|
||||
>>> outputs = sentence_fuser.generate(input_ids)
|
||||
|
||||
|
||||
@ -166,6 +166,11 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o
|
||||
[[autodoc]] FlaxBertForPreTraining
|
||||
- __call__
|
||||
|
||||
## FlaxBertForCausalLM
|
||||
|
||||
[[autodoc]] FlaxBertForCausalLM
|
||||
- __call__
|
||||
|
||||
## FlaxBertForMaskedLM
|
||||
|
||||
[[autodoc]] FlaxBertForMaskedLM
|
||||
|
||||
@ -120,6 +120,11 @@ This model was contributed by [vasudevgupta](https://huggingface.co/vasudevgupta
|
||||
[[autodoc]] FlaxBigBirdForPreTraining
|
||||
- __call__
|
||||
|
||||
## FlaxBigBirdForCausalLM
|
||||
|
||||
[[autodoc]] FlaxBigBirdForCausalLM
|
||||
- __call__
|
||||
|
||||
## FlaxBigBirdForMaskedLM
|
||||
|
||||
[[autodoc]] FlaxBigBirdForMaskedLM
|
||||
|
||||
57
docs/source/en/model_doc/bloom.mdx
Normal file
57
docs/source/en/model_doc/bloom.mdx
Normal file
@ -0,0 +1,57 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# BLOOM
|
||||
|
||||
## Overview
|
||||
|
||||
The BLOOM model has been proposed with its various versions through the [BigScience Workshop](https://bigscience.huggingface.co/). BigScience is inspired by other open science initiatives where researchers have pooled their time and resources to collectively achieve a higher impact.
|
||||
The architecture of BLOOM is essentially similar to GPT3 (auto-regressive model for next token prediction), but has been trained on different 46 languages including code.
|
||||
Several smaller versions of the models have been trained on the same dataset. BLOOM is available in the following versions:
|
||||
|
||||
- [bloom-350m](https://huggingface.co/bigscience/bloom-350m)
|
||||
- [bloom-760m](https://huggingface.co/bigscience/bloom-760m)
|
||||
- [bloom-1b3](https://huggingface.co/bigscience/bloom-1b3)
|
||||
- [bloom-2b5](https://huggingface.co/bigscience/bloom-2b5)
|
||||
- [bloom-6b3](https://huggingface.co/bigscience/bloom-6b3)
|
||||
- [bloom](https://huggingface.co/bigscience/bloom) (175B parameters)
|
||||
|
||||
|
||||
## BloomConfig
|
||||
|
||||
[[autodoc]] BloomConfig
|
||||
- all
|
||||
|
||||
## BloomModel
|
||||
|
||||
[[autodoc]] BloomModel
|
||||
- forward
|
||||
|
||||
## BloomTokenizerFast
|
||||
|
||||
[[autodoc]] BloomTokenizerFast
|
||||
- all
|
||||
|
||||
## BloomForCausalLM
|
||||
|
||||
[[autodoc]] BloomForCausalLM
|
||||
- forward
|
||||
|
||||
## BloomForSequenceClassification
|
||||
|
||||
[[autodoc]] BloomForSequenceClassification
|
||||
- forward
|
||||
|
||||
## BloomForTokenClassification
|
||||
|
||||
[[autodoc]] BloomForTokenClassification
|
||||
- forward
|
||||
@ -38,3 +38,7 @@ Note: We only have a tokenizer here, since the model architecture is the same as
|
||||
## CpmTokenizer
|
||||
|
||||
[[autodoc]] CpmTokenizer
|
||||
|
||||
## CpmTokenizerFast
|
||||
|
||||
[[autodoc]] CpmTokenizerFast
|
||||
|
||||
53
docs/source/en/model_doc/cvt.mdx
Normal file
53
docs/source/en/model_doc/cvt.mdx
Normal file
@ -0,0 +1,53 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Convolutional Vision Transformer (CvT)
|
||||
|
||||
## Overview
|
||||
|
||||
The CvT model was proposed in [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan and Lei Zhang. The Convolutional vision Transformer (CvT) improves the [Vision Transformer (ViT)](vit) in performance and efficiency by introducing convolutions into ViT to yield the best of both designs.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We present in this paper a new architecture, named Convolutional vision Transformer (CvT), that improves Vision Transformer (ViT)
|
||||
in performance and efficiency by introducing convolutions into ViT to yield the best of both designs. This is accomplished through
|
||||
two primary modifications: a hierarchy of Transformers containing a new convolutional token embedding, and a convolutional Transformer
|
||||
block leveraging a convolutional projection. These changes introduce desirable properties of convolutional neural networks (CNNs)
|
||||
to the ViT architecture (\ie shift, scale, and distortion invariance) while maintaining the merits of Transformers (\ie dynamic attention,
|
||||
global context, and better generalization). We validate CvT by conducting extensive experiments, showing that this approach achieves
|
||||
state-of-the-art performance over other Vision Transformers and ResNets on ImageNet-1k, with fewer parameters and lower FLOPs. In addition,
|
||||
performance gains are maintained when pretrained on larger datasets (\eg ImageNet-22k) and fine-tuned to downstream tasks. Pre-trained on
|
||||
ImageNet-22k, our CvT-W24 obtains a top-1 accuracy of 87.7\% on the ImageNet-1k val set. Finally, our results show that the positional encoding,
|
||||
a crucial component in existing Vision Transformers, can be safely removed in our model, simplifying the design for higher resolution vision tasks.*
|
||||
|
||||
Tips:
|
||||
|
||||
- CvT models are regular Vision Transformers, but trained with convolutions. They outperform the [original model (ViT)](vit) when fine-tuned on ImageNet-1K and CIFAR-100.
|
||||
- You can check out demo notebooks regarding inference as well as fine-tuning on custom data [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer) (you can just replace [`ViTFeatureExtractor`] by [`AutoFeatureExtractor`] and [`ViTForImageClassification`] by [`CvtForImageClassification`]).
|
||||
- The available checkpoints are either (1) pre-trained on [ImageNet-22k](http://www.image-net.org/) (a collection of 14 million images and 22k classes) only, (2) also fine-tuned on ImageNet-22k or (3) also fine-tuned on [ImageNet-1k](http://www.image-net.org/challenges/LSVRC/2012/) (also referred to as ILSVRC 2012, a collection of 1.3 million
|
||||
images and 1,000 classes).
|
||||
|
||||
This model was contributed by [anugunj](https://huggingface.co/anugunj). The original code can be found [here](https://github.com/microsoft/CvT).
|
||||
|
||||
## CvtConfig
|
||||
|
||||
[[autodoc]] CvtConfig
|
||||
|
||||
## CvtModel
|
||||
|
||||
[[autodoc]] CvtModel
|
||||
- forward
|
||||
|
||||
## CvtForImageClassification
|
||||
|
||||
[[autodoc]] CvtForImageClassification
|
||||
- forward
|
||||
@ -33,11 +33,19 @@ Models and code are available at www.github.com/pytorch/fairseq/tree/master/exam
|
||||
|
||||
Tips:
|
||||
|
||||
- Both Data2VecAudio and Data2VecText have been trained using the same self-supervised learning method.
|
||||
In the case of Data2VecAudio, preprocessing is identical to [`RobertaModel`], including tokenization.
|
||||
- Data2VecAudio, Data2VecText, and Data2VecVision have all been trained using the same self-supervised learning method.
|
||||
- For Data2VecAudio, preprocessing is identical to [`Wav2Vec2Model`], including feature extraction
|
||||
- For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization.
|
||||
- For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction.
|
||||
- To know how a pre-trained Data2Vec vision model can be fine-tuned on the task of image classification, you can check out
|
||||
[this notebook](https://colab.research.google.com/github/sayakpaul/TF-2.0-Hacks/blob/master/data2vec_vision_image_classification.ipynb).
|
||||
|
||||
This model was contributed by [edugp](https://huggingface.co/edugp).
|
||||
The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec).
|
||||
|
||||
This model was contributed by [edugp](https://huggingface.co/edugp) and [patrickvonplaten](https://huggingface.co/patrickvonplaten).
|
||||
[sayakpaul](https://github.com/sayakpaul) and [Rocketknight1](https://github.com/Rocketknight1) contributed Data2Vec for vision in TensorFlow.
|
||||
|
||||
The original code (for NLP and Speech) can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec).
|
||||
The original code for vision can be found [here](https://github.com/facebookresearch/data2vec_vision/tree/main/beit).
|
||||
|
||||
|
||||
## Data2VecTextConfig
|
||||
@ -48,12 +56,16 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma
|
||||
|
||||
[[autodoc]] Data2VecAudioConfig
|
||||
|
||||
## Data2VecVisionConfig
|
||||
|
||||
[[autodoc]] Data2VecVisionConfig
|
||||
|
||||
|
||||
## Data2VecAudioModel
|
||||
|
||||
[[autodoc]] Data2VecAudioModel
|
||||
- forward
|
||||
|
||||
|
||||
## Data2VecAudioForAudioFrameClassification
|
||||
|
||||
[[autodoc]] Data2VecAudioForAudioFrameClassification
|
||||
@ -108,3 +120,33 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma
|
||||
|
||||
[[autodoc]] Data2VecTextForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## Data2VecVisionModel
|
||||
|
||||
[[autodoc]] Data2VecVisionModel
|
||||
- forward
|
||||
|
||||
## Data2VecVisionForImageClassification
|
||||
|
||||
[[autodoc]] Data2VecVisionForImageClassification
|
||||
- forward
|
||||
|
||||
## Data2VecVisionForSemanticSegmentation
|
||||
|
||||
[[autodoc]] Data2VecVisionForSemanticSegmentation
|
||||
- forward
|
||||
|
||||
## TFData2VecVisionModel
|
||||
|
||||
[[autodoc]] TFData2VecVisionModel
|
||||
- call
|
||||
|
||||
## TFData2VecVisionForImageClassification
|
||||
|
||||
[[autodoc]] TFData2VecVisionForImageClassification
|
||||
- call
|
||||
|
||||
## TFData2VecVisionForSemanticSegmentation
|
||||
|
||||
[[autodoc]] TFData2VecVisionForSemanticSegmentation
|
||||
- call
|
||||
|
||||
@ -71,6 +71,12 @@ contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code
|
||||
- create_token_type_ids_from_sequences
|
||||
- save_vocabulary
|
||||
|
||||
## DebertaV2TokenizerFast
|
||||
|
||||
[[autodoc]] DebertaV2TokenizerFast
|
||||
- build_inputs_with_special_tokens
|
||||
- create_token_type_ids_from_sequences
|
||||
|
||||
## DebertaV2Model
|
||||
|
||||
[[autodoc]] DebertaV2Model
|
||||
@ -101,6 +107,11 @@ contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code
|
||||
[[autodoc]] DebertaV2ForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## DebertaV2ForMultipleChoice
|
||||
|
||||
[[autodoc]] DebertaV2ForMultipleChoice
|
||||
- forward
|
||||
|
||||
## TFDebertaV2Model
|
||||
|
||||
[[autodoc]] TFDebertaV2Model
|
||||
|
||||
@ -158,6 +158,11 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). The o
|
||||
[[autodoc]] FlaxElectraForPreTraining
|
||||
- __call__
|
||||
|
||||
## FlaxElectraForCausalLM
|
||||
|
||||
[[autodoc]] FlaxElectraForCausalLM
|
||||
- __call__
|
||||
|
||||
## FlaxElectraForMaskedLM
|
||||
|
||||
[[autodoc]] FlaxElectraForMaskedLM
|
||||
|
||||
96
docs/source/en/model_doc/flava.mdx
Normal file
96
docs/source/en/model_doc/flava.mdx
Normal file
@ -0,0 +1,96 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# FLAVA
|
||||
|
||||
## Overview
|
||||
|
||||
The FLAVA model was proposed in [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela and is accepted at CVPR 2022.
|
||||
|
||||
The paper aims at creating a single unified foundation model which can work across vision, language
|
||||
as well as vision-and-language multimodal tasks.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*State-of-the-art vision and vision-and-language models rely on large-scale visio-linguistic pretraining for obtaining good performance on a variety
|
||||
of downstream tasks. Generally, such models are often either cross-modal (contrastive) or multi-modal
|
||||
(with earlier fusion) but not both; and they often only target specific modalities or tasks. A promising
|
||||
direction would be to use a single holistic universal model, as a "foundation", that targets all modalities
|
||||
at once -- a true vision and language foundation model should be good at vision tasks, language tasks, and
|
||||
cross- and multi-modal vision and language tasks. We introduce FLAVA as such a model and demonstrate
|
||||
impressive performance on a wide range of 35 tasks spanning these target modalities.*
|
||||
|
||||
|
||||
This model was contributed by [aps](https://huggingface.co/aps). The original code can be found [here](https://github.com/facebookresearch/multimodal/tree/main/examples/flava).
|
||||
|
||||
|
||||
## FlavaConfig
|
||||
|
||||
[[autodoc]] FlavaConfig
|
||||
|
||||
## FlavaTextConfig
|
||||
|
||||
[[autodoc]] FlavaTextConfig
|
||||
|
||||
## FlavaImageConfig
|
||||
|
||||
[[autodoc]] FlavaImageConfig
|
||||
|
||||
## FlavaMultimodalConfig
|
||||
|
||||
[[autodoc]] FlavaMultimodalConfig
|
||||
|
||||
## FlavaImageCodebookConfig
|
||||
|
||||
[[autodoc]] FlavaImageCodebookConfig
|
||||
|
||||
## FlavaProcessor
|
||||
|
||||
[[autodoc]] FlavaProcessor
|
||||
|
||||
## FlavaFeatureExtractor
|
||||
|
||||
[[autodoc]] FlavaFeatureExtractor
|
||||
|
||||
## FlavaForPreTraining
|
||||
|
||||
[[autodoc]] FlavaForPreTraining
|
||||
- forward
|
||||
|
||||
## FlavaModel
|
||||
|
||||
[[autodoc]] FlavaModel
|
||||
- forward
|
||||
- get_text_features
|
||||
- get_image_features
|
||||
|
||||
## FlavaImageCodebook
|
||||
|
||||
[[autodoc]] FlavaImageCodebook
|
||||
- forward
|
||||
- get_codebook_indices
|
||||
- get_codebook_probs
|
||||
|
||||
## FlavaTextModel
|
||||
|
||||
[[autodoc]] FlavaTextModel
|
||||
- forward
|
||||
|
||||
## FlavaImageModel
|
||||
|
||||
[[autodoc]] FlavaImageModel
|
||||
- forward
|
||||
|
||||
## FlavaMultimodalModel
|
||||
|
||||
[[autodoc]] FlavaMultimodalModel
|
||||
- forward
|
||||
76
docs/source/en/model_doc/gpt_neox.mdx
Normal file
76
docs/source/en/model_doc/gpt_neox.mdx
Normal file
@ -0,0 +1,76 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# GPT-NeoX
|
||||
|
||||
## Overview
|
||||
|
||||
We introduce GPT-NeoX-20B, a 20 billion parameter autoregressive language model trained on the Pile, whose weights will
|
||||
be made freely and openly available to the public through a permissive license. It is, to the best of our knowledge,
|
||||
the largest dense autoregressive model that has publicly available weights at the time of submission. In this work,
|
||||
we describe GPT-NeoX-20B's architecture and training and evaluate its performance on a range of language-understanding,
|
||||
mathematics, and knowledge-based tasks. We find that GPT-NeoX-20B is a particularly powerful few-shot reasoner and
|
||||
gains far more in performance when evaluated five-shot than similarly sized GPT-3 and FairSeq models. We open-source
|
||||
the training and evaluation code, as well as the model weights, at [https://github.com/EleutherAI/gpt-neox](https://github.com/EleutherAI/gpt-neox).
|
||||
|
||||
Development of the model was led by Sid Black, Stella Biderman and Eric Hallahan, and the model was trained with
|
||||
generous the support of [CoreWeave](https://www.coreweave.com/).
|
||||
|
||||
GPT-NeoX-20B was trained with fp16, thus it is recommended to initialize the model as follows:
|
||||
|
||||
```python
|
||||
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b").half().cuda()
|
||||
```
|
||||
|
||||
GPT-NeoX-20B also has a different tokenizer from the one used in GPT-J-6B and GPT-Neo. The new tokenizer allocates
|
||||
additional tokens to whitespace characters, making the model more suitable for certain tasks like code generation.
|
||||
|
||||
### Generation
|
||||
|
||||
The `generate()` method can be used to generate text using GPT Neo model.
|
||||
|
||||
```python
|
||||
>>> from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast
|
||||
|
||||
>>> model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b")
|
||||
>>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("EleutherAI/gpt-neox-20b")
|
||||
|
||||
>>> prompt = "GPTNeoX20B is a 20B-parameter autoregressive Transformer model developed by EleutherAI."
|
||||
|
||||
>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
|
||||
|
||||
>>> gen_tokens = model.generate(
|
||||
... input_ids,
|
||||
... do_sample=True,
|
||||
... temperature=0.9,
|
||||
... max_length=100,
|
||||
... )
|
||||
>>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
|
||||
```
|
||||
|
||||
## GPTNeoXConfig
|
||||
|
||||
[[autodoc]] GPTNeoXConfig
|
||||
|
||||
## GPTNeoXTokenizerFast
|
||||
|
||||
[[autodoc]] GPTNeoXTokenizerFast
|
||||
|
||||
## GPTNeoXModel
|
||||
|
||||
[[autodoc]] GPTNeoXModel
|
||||
- forward
|
||||
|
||||
## GPTNeoXForCausalLM
|
||||
|
||||
[[autodoc]] GPTNeoXForCausalLM
|
||||
- forward
|
||||
@ -44,6 +44,14 @@ including FUNSD (0.7895 -> 0.8420), CORD (0.9493 -> 0.9601), SROIE (0.9524 -> 0.
|
||||
RVL-CDIP (0.9443 -> 0.9564), and DocVQA (0.7295 -> 0.8672). The pre-trained LayoutLMv2 model is publicly available at
|
||||
this https URL.*
|
||||
|
||||
LayoutLMv2 depends on `detectron2`, `torchvision` and `tesseract`. Run the
|
||||
following to install them:
|
||||
```
|
||||
python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
|
||||
python -m pip install torchvision tesseract
|
||||
```
|
||||
(If you are developing for LayoutLMv2, note that passing the doctests also requires the installation of these packages.)
|
||||
|
||||
Tips:
|
||||
|
||||
- The main difference between LayoutLMv1 and LayoutLMv2 is that the latter incorporates visual embeddings during
|
||||
|
||||
85
docs/source/en/model_doc/layoutlmv3.mdx
Normal file
85
docs/source/en/model_doc/layoutlmv3.mdx
Normal file
@ -0,0 +1,85 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# LayoutLMv3
|
||||
|
||||
## Overview
|
||||
|
||||
The LayoutLMv3 model was proposed in [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
|
||||
LayoutLMv3 simplifies [LayoutLMv2](layoutlmv2) by using patch embeddings (as in [ViT](vit)) instead of leveraging a CNN backbone, and pre-trains the model on 3 objectives: masked language modeling (MLM), masked image modeling (MIM)
|
||||
and word-patch alignment (WPA).
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Self-supervised pre-training techniques have achieved remarkable progress in Document AI. Most multimodal pre-trained models use a masked language modeling objective to learn bidirectional representations on the text modality, but they differ in pre-training objectives for the image modality. This discrepancy adds difficulty to multimodal representation learning. In this paper, we propose LayoutLMv3 to pre-train multimodal Transformers for Document AI with unified text and image masking. Additionally, LayoutLMv3 is pre-trained with a word-patch alignment objective to learn cross-modal alignment by predicting whether the corresponding image patch of a text word is masked. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model for both text-centric and image-centric Document AI tasks. Experimental results show that LayoutLMv3 achieves state-of-the-art performance not only in text-centric tasks, including form understanding, receipt understanding, and document visual question answering, but also in image-centric tasks such as document image classification and document layout analysis.*
|
||||
|
||||
Tips:
|
||||
|
||||
- In terms of data processing, LayoutLMv3 is identical to its predecessor [LayoutLMv2](layoutlmv2), except that:
|
||||
- images need to be resized and normalized with channels in regular RGB format. LayoutLMv2 on the other hand normalizes the images internally and expects the channels in BGR format.
|
||||
- text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece.
|
||||
Due to these differences in data preprocessing, one can use [`LayoutLMv3Processor`] which internally combines a [`LayoutLMv3FeatureExtractor`] (for the image modality) and a [`LayoutLMv3Tokenizer`]/[`LayoutLMv3TokenizerFast`] (for the text modality) to prepare all data for the model.
|
||||
- Regarding usage of [`LayoutLMv3Processor`], we refer to the [usage guide](layoutlmv2#usage-LayoutLMv2Processor) of its predecessor.
|
||||
- Demo notebooks for LayoutLMv3 can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/LayoutLMv3).
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/layoutlmv3_architecture.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> LayoutLMv3 architecture. Taken from the <a href="https://arxiv.org/abs/2204.08387">original paper</a>. </small>
|
||||
|
||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/layoutlmv3).
|
||||
|
||||
|
||||
## LayoutLMv3Config
|
||||
|
||||
[[autodoc]] LayoutLMv3Config
|
||||
|
||||
## LayoutLMv3FeatureExtractor
|
||||
|
||||
[[autodoc]] LayoutLMv3FeatureExtractor
|
||||
- __call__
|
||||
|
||||
## LayoutLMv3Tokenizer
|
||||
|
||||
[[autodoc]] LayoutLMv3Tokenizer
|
||||
- __call__
|
||||
- save_vocabulary
|
||||
|
||||
## LayoutLMv3TokenizerFast
|
||||
|
||||
[[autodoc]] LayoutLMv3TokenizerFast
|
||||
- __call__
|
||||
|
||||
## LayoutLMv3Processor
|
||||
|
||||
[[autodoc]] LayoutLMv3Processor
|
||||
- __call__
|
||||
|
||||
## LayoutLMv3Model
|
||||
|
||||
[[autodoc]] LayoutLMv3Model
|
||||
- forward
|
||||
|
||||
## LayoutLMv3ForSequenceClassification
|
||||
|
||||
[[autodoc]] LayoutLMv3ForSequenceClassification
|
||||
- forward
|
||||
|
||||
## LayoutLMv3ForTokenClassification
|
||||
|
||||
[[autodoc]] LayoutLMv3ForTokenClassification
|
||||
- forward
|
||||
|
||||
## LayoutLMv3ForQuestionAnswering
|
||||
|
||||
[[autodoc]] LayoutLMv3ForQuestionAnswering
|
||||
- forward
|
||||
@ -44,8 +44,10 @@ Tips:
|
||||
- LED makes use of *global attention* by means of the `global_attention_mask` (see
|
||||
[`LongformerModel`]). For summarization, it is advised to put *global attention* only on the first
|
||||
`<s>` token. For question answering, it is advised to put *global attention* on all tokens of the question.
|
||||
- To fine-tune LED on all 16384, it is necessary to enable *gradient checkpointing* by executing
|
||||
`model.gradient_checkpointing_enable()`.
|
||||
- To fine-tune LED on all 16384, *gradient checkpointing* can be enabled in case training leads to out-of-memory (OOM)
|
||||
errors. This can be done by executing `model.gradient_checkpointing_enable()`.
|
||||
Moreover, the `use_cache=False`
|
||||
flag can be used to disable the caching mechanism to save memory.
|
||||
- A notebook showing how to evaluate LED, can be accessed [here](https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing).
|
||||
- A notebook showing how to fine-tune LED, can be accessed [here](https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing).
|
||||
|
||||
|
||||
87
docs/source/en/model_doc/levit.mdx
Normal file
87
docs/source/en/model_doc/levit.mdx
Normal file
@ -0,0 +1,87 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# LeViT
|
||||
|
||||
## Overview
|
||||
|
||||
The LeViT model was proposed in [LeViT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. LeViT improves the [Vision Transformer (ViT)](vit) in performance and efficiency by a few architectural differences such as activation maps with decreasing resolutions in Transformers and the introduction of an attention bias to integrate positional information.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We design a family of image classification architectures that optimize the trade-off between accuracy
|
||||
and efficiency in a high-speed regime. Our work exploits recent findings in attention-based architectures,
|
||||
which are competitive on highly parallel processing hardware. We revisit principles from the extensive
|
||||
literature on convolutional neural networks to apply them to transformers, in particular activation maps
|
||||
with decreasing resolutions. We also introduce the attention bias, a new way to integrate positional information
|
||||
in vision transformers. As a result, we propose LeVIT: a hybrid neural network for fast inference image classification.
|
||||
We consider different measures of efficiency on different hardware platforms, so as to best reflect a wide range of
|
||||
application scenarios. Our extensive experiments empirically validate our technical choices and show they are suitable
|
||||
to most architectures. Overall, LeViT significantly outperforms existing convnets and vision transformers with respect
|
||||
to the speed/accuracy tradeoff. For example, at 80% ImageNet top-1 accuracy, LeViT is 5 times faster than EfficientNet on CPU. *
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/levit_architecture.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> LeViT Architecture. Taken from the <a href="https://arxiv.org/abs/2104.01136">original paper</a>.</small>
|
||||
|
||||
Tips:
|
||||
|
||||
- Compared to ViT, LeViT models use an additional distillation head to effectively learn from a teacher (which, in the LeViT paper, is a ResNet like-model). The distillation head is learned through backpropagation under supervision of a ResNet like-model. They also draw inspiration from convolution neural networks to use activation maps with decreasing resolutions to increase the efficiency.
|
||||
- There are 2 ways to fine-tune distilled models, either (1) in a classic way, by only placing a prediction head on top
|
||||
of the final hidden state and not using the distillation head, or (2) by placing both a prediction head and distillation
|
||||
head on top of the final hidden state. In that case, the prediction head is trained using regular cross-entropy between
|
||||
the prediction of the head and the ground-truth label, while the distillation prediction head is trained using hard distillation
|
||||
(cross-entropy between the prediction of the distillation head and the label predicted by the teacher). At inference time,
|
||||
one takes the average prediction between both heads as final prediction. (2) is also called "fine-tuning with distillation",
|
||||
because one relies on a teacher that has already been fine-tuned on the downstream dataset. In terms of models, (1) corresponds
|
||||
to [`LevitForImageClassification`] and (2) corresponds to [`LevitForImageClassificationWithTeacher`].
|
||||
- All released checkpoints were pre-trained and fine-tuned on [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k)
|
||||
(also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes). only. No external data was used. This is in
|
||||
contrast with the original ViT model, which used external data like the JFT-300M dataset/Imagenet-21k for
|
||||
pre-training.
|
||||
- The authors of LeViT released 5 trained LeViT models, which you can directly plug into [`LevitModel`] or [`LevitForImageClassification`].
|
||||
Techniques like data augmentation, optimization, and regularization were used in order to simulate training on a much larger dataset
|
||||
(while only using ImageNet-1k for pre-training). The 5 variants available are (all trained on images of size 224x224):
|
||||
*facebook/levit-128S*, *facebook/levit-128*, *facebook/levit-192*, *facebook/levit-256* and
|
||||
*facebook/levit-384*. Note that one should use [`LevitFeatureExtractor`] in order to
|
||||
prepare images for the model.
|
||||
- [`LevitForImageClassificationWithTeacher`] currently supports only inference and not training or fine-tuning.
|
||||
- You can check out demo notebooks regarding inference as well as fine-tuning on custom data [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer)
|
||||
(you can just replace [`ViTFeatureExtractor`] by [`LevitFeatureExtractor`] and [`ViTForImageClassification`] by [`LevitForImageClassification`] or [`LevitForImageClassificationWithTeacher`]).
|
||||
|
||||
This model was contributed by [anugunj](https://huggingface.co/anugunj). The original code can be found [here](https://github.com/facebookresearch/LeViT).
|
||||
|
||||
|
||||
## LevitConfig
|
||||
|
||||
[[autodoc]] LevitConfig
|
||||
|
||||
## LevitFeatureExtractor
|
||||
|
||||
[[autodoc]] LevitFeatureExtractor
|
||||
- __call__
|
||||
|
||||
## LevitModel
|
||||
|
||||
[[autodoc]] LevitModel
|
||||
- forward
|
||||
|
||||
## LevitForImageClassification
|
||||
|
||||
[[autodoc]] LevitForImageClassification
|
||||
- forward
|
||||
|
||||
## LevitForImageClassificationWithTeacher
|
||||
|
||||
[[autodoc]] LevitForImageClassificationWithTeacher
|
||||
- forward
|
||||
121
docs/source/en/model_doc/longt5.mdx
Normal file
121
docs/source/en/model_doc/longt5.mdx
Normal file
@ -0,0 +1,121 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# LongT5
|
||||
|
||||
## Overview
|
||||
|
||||
The LongT5 model was proposed in [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916)
|
||||
by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung and Yinfei Yang. It's an
|
||||
encoder-decoder transformer pre-trained in a text-to-text denoising generative setting. LongT5 model is an extension of
|
||||
T5 model, and it enables using one of the two different efficient attention mechanisms - (1) Local attention, or (2)
|
||||
Transient-Global attention.
|
||||
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Recent work has shown that either (1) increasing the input length or (2) increasing model size can improve the
|
||||
performance of Transformer-based neural models. In this paper, we present a new model, called LongT5, with which we
|
||||
explore the effects of scaling both the input length and model size at the same time. Specifically, we integrated
|
||||
attention ideas from long-input transformers (ETC), and adopted pre-training strategies from summarization pre-training
|
||||
(PEGASUS) into the scalable T5 architecture. The result is a new attention mechanism we call {\em Transient Global}
|
||||
(TGlobal), which mimics ETC's local/global attention mechanism, but without requiring additional side-inputs. We are
|
||||
able to achieve state-of-the-art results on several summarization tasks and outperform the original T5 models on
|
||||
question answering tasks.*
|
||||
|
||||
Tips:
|
||||
|
||||
- [`LongT5ForConditionalGeneration`] is an extension of [`T5ForConditionalGeneration`] exchanging the traditional
|
||||
encoder *self-attention* layer with efficient either *local* attention or *transient-global* (*tglobal*) attention.
|
||||
- Unlike the T5 model, LongT5 does not use a task prefix. Furthermore, it uses a different pre-training objective
|
||||
inspired by the pre-training of `[PegasusForConditionalGeneration]`.
|
||||
- LongT5 model is designed to work efficiently and very well on long-range *sequence-to-sequence* tasks where the
|
||||
input sequence exceeds commonly used 512 tokens. It is capable of handling input sequences of a length up to 16,384 tokens.
|
||||
- For *Local Attention*, the sparse sliding-window local attention operation allows a given token to attend only `r`
|
||||
tokens to the left and right of it (with `r=127` by default). *Local Attention* does not introduce any new parameters
|
||||
to the model. The complexity of the mechanism is linear in input sequence length `l`: `O(l*r)`.
|
||||
- *Transient Global Attention* is an extension of the *Local Attention*. It, furthermore, allows each input token to
|
||||
interact with all other tokens in the layer. This is achieved via splitting an input sequence into blocks of a fixed
|
||||
length `k` (with a default `k=16`). Then, a global token for such a block is obtained via summing and normalizing the embeddings of every token
|
||||
in the block. Thanks to this, the attention allows each token to attend to both nearby tokens like in Local attention, and
|
||||
also every global token like in the case of standard global attention (*transient* represents the fact the global tokens
|
||||
are constructed dynamically within each attention operation). As a consequence, *TGlobal* attention introduces
|
||||
a few new parameters -- global relative position biases and a layer normalization for global token's embedding.
|
||||
The complexity of this mechanism is `O(l(r + l/k))`.
|
||||
- An example showing how to evaluate a fine-tuned LongT5 model on the [pubmed dataset](https://huggingface.co/datasets/scientific_papers) is below.
|
||||
|
||||
```python
|
||||
>>> import evaluate
|
||||
>>> from datasets import load_dataset
|
||||
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration
|
||||
|
||||
>>> dataset = load_dataset("scientific_papers", "pubmed", split="validation")
|
||||
>>> model = (
|
||||
... LongT5ForConditionalGeneration.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
|
||||
... .to("cuda")
|
||||
... .half()
|
||||
... )
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
|
||||
|
||||
|
||||
>>> def generate_answers(batch):
|
||||
... inputs_dict = tokenizer(
|
||||
... batch["article"], max_length=16384, padding="max_length", truncation=True, return_tensors="pt"
|
||||
... )
|
||||
... input_ids = inputs_dict.input_ids.to("cuda")
|
||||
... attention_mask = inputs_dict.attention_mask.to("cuda")
|
||||
... output_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=512, num_beams=2)
|
||||
... batch["predicted_abstract"] = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
|
||||
... return batch
|
||||
|
||||
|
||||
>>> result = dataset.map(generate_answer, batched=True, batch_size=2)
|
||||
>>> rouge = evaluate.load("rouge")
|
||||
>>> rouge.compute(predictions=result["predicted_abstract"], references=result["abstract"])
|
||||
```
|
||||
|
||||
This model was contributed by [stancld](https://huggingface.co/stancld).
|
||||
The original code can be found [here](https://github.com/google-research/longt5).
|
||||
|
||||
|
||||
## LongT5Config
|
||||
|
||||
[[autodoc]] LongT5Config
|
||||
|
||||
## LongT5Model
|
||||
|
||||
[[autodoc]] LongT5Model
|
||||
- forward
|
||||
|
||||
## LongT5ForConditionalGeneration
|
||||
|
||||
[[autodoc]] LongT5ForConditionalGeneration
|
||||
- forward
|
||||
|
||||
## LongT5EncoderModel
|
||||
|
||||
[[autodoc]] LongT5EncoderModel
|
||||
- forward
|
||||
|
||||
## FlaxLongT5Model
|
||||
|
||||
[[autodoc]] FlaxLongT5Model
|
||||
- __call__
|
||||
- encode
|
||||
- decode
|
||||
|
||||
## FlaxLongT5ForConditionalGeneration
|
||||
|
||||
[[autodoc]] FlaxLongT5ForConditionalGeneration
|
||||
- __call__
|
||||
- encode
|
||||
- decode
|
||||
@ -97,7 +97,7 @@ Example:
|
||||
>>> entities = [
|
||||
... "Beyoncé",
|
||||
... "Los Angeles",
|
||||
>>> ] # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
|
||||
... ] # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
|
||||
>>> entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
|
||||
>>> inputs = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
|
||||
62
docs/source/en/model_doc/mctct.mdx
Normal file
62
docs/source/en/model_doc/mctct.mdx
Normal file
@ -0,0 +1,62 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# M-CTC-T
|
||||
|
||||
## Overview
|
||||
|
||||
The M-CTC-T model was proposed in [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. The model is a 1B-param transformer encoder, with a CTC head over 8065 character labels and a language identification head over 60 language ID labels. It is trained on Common Voice (version 6.1, December 2020 release) and VoxPopuli. After training on Common Voice and VoxPopuli, the model is trained on Common Voice only. The labels are unnormalized character-level transcripts (punctuation and capitalization are not removed). The model takes as input Mel filterbank features from a 16Khz audio signal.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Semi-supervised learning through pseudo-labeling has become a staple of state-of-the-art monolingual
|
||||
speech recognition systems. In this work, we extend pseudo-labeling to massively multilingual speech
|
||||
recognition with 60 languages. We propose a simple pseudo-labeling recipe that works well even
|
||||
with low-resource languages: train a supervised multilingual model, fine-tune it with semi-supervised
|
||||
learning on a target language, generate pseudo-labels for that language, and train a final model using
|
||||
pseudo-labels for all languages, either from scratch or by fine-tuning. Experiments on the labeled
|
||||
Common Voice and unlabeled VoxPopuli datasets show that our recipe can yield a model with better
|
||||
performance for many languages that also transfers well to LibriSpeech.*
|
||||
|
||||
|
||||
|
||||
This model was contributed by [cwkeam](https://huggingface.co/cwkeam). The original code can be found [here](https://github.com/flashlight/wav2letter/tree/main/recipes/mling_pl).
|
||||
|
||||
## MCTCTConfig
|
||||
|
||||
[[autodoc]] MCTCTConfig
|
||||
|
||||
## MCTCTFeatureExtractor
|
||||
|
||||
[[autodoc]] MCTCTFeatureExtractor
|
||||
- __call__
|
||||
|
||||
## MCTCTProcessor
|
||||
|
||||
[[autodoc]] MCTCTProcessor
|
||||
- __call__
|
||||
- from_pretrained
|
||||
- save_pretrained
|
||||
- batch_decode
|
||||
- decode
|
||||
- as_target_processor
|
||||
|
||||
|
||||
## MCTCTModel
|
||||
|
||||
[[autodoc]] MCTCTModel
|
||||
- forward
|
||||
|
||||
## MCTCTForCTC
|
||||
|
||||
[[autodoc]] MCTCTForCTC
|
||||
- forward
|
||||
66
docs/source/en/model_doc/opt.mdx
Normal file
66
docs/source/en/model_doc/opt.mdx
Normal file
@ -0,0 +1,66 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# OPT
|
||||
|
||||
## Overview
|
||||
|
||||
The OPT model was proposed in [Open Pre-trained Transformer Language Models](https://arxiv.org/pdf/2205.01068) by Meta AI.
|
||||
OPT is a series of open-sourced large causal language models which perform similar in performance to GPT3.
|
||||
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Large language models, which are often trained for hundreds of thousands of compute days, have shown remarkable capabilities for zero- and few-shot learning. Given their computational cost, these models are difficult to replicate without significant capital. For the few that are available through APIs, no access is granted to the full model weights, making them difficult to study. We present Open Pre-trained Transformers (OPT), a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters, which we aim to fully and responsibly share with interested researchers. We show that OPT-175B is comparable to GPT-3, while requiring only 1/7th the carbon footprint to develop. We are also releasing our logbook detailing the infrastructure challenges we faced, along with code for experimenting with all of the released models.*
|
||||
|
||||
Tips:
|
||||
- OPT has the same architecture as [`BartDecoder`].
|
||||
- Contrary to GPT2, OPT adds the EOS token `</s>` to the beginning of every prompt. **Note**: Make sure to pass `use_fast=False` when loading OPT's tokenizer with [`AutoTokenizer`] to get the correct tokenizer.
|
||||
|
||||
This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), and [Patrick Von Platen](https://huggingface.co/patrickvonplaten).
|
||||
The original code can be found [here](https://github.com/facebookresearch/metaseq).
|
||||
|
||||
|
||||
## OPTConfig
|
||||
|
||||
[[autodoc]] OPTConfig
|
||||
|
||||
## OPTModel
|
||||
|
||||
[[autodoc]] OPTModel
|
||||
- forward
|
||||
|
||||
## OPTForCausalLM
|
||||
|
||||
[[autodoc]] OPTForCausalLM
|
||||
- forward
|
||||
|
||||
## TFOPTModel
|
||||
|
||||
[[autodoc]] TFOPTModel
|
||||
- call
|
||||
|
||||
## TFOPTForCausalLM
|
||||
|
||||
[[autodoc]] TFOPTForCausalLM
|
||||
- call
|
||||
|
||||
## FlaxOPTModel
|
||||
|
||||
[[autodoc]] FlaxOPTModel
|
||||
- __call__
|
||||
|
||||
|
||||
## FlaxOPTForCausalLM
|
||||
|
||||
[[autodoc]] FlaxOPTForCausalLM
|
||||
- __call__
|
||||
@ -136,6 +136,11 @@ This model was contributed by [julien-c](https://huggingface.co/julien-c). The o
|
||||
[[autodoc]] FlaxRobertaModel
|
||||
- __call__
|
||||
|
||||
## FlaxRobertaForCausalLM
|
||||
|
||||
[[autodoc]] FlaxRobertaForCausalLM
|
||||
- __call__
|
||||
|
||||
## FlaxRobertaForMaskedLM
|
||||
|
||||
[[autodoc]] FlaxRobertaForMaskedLM
|
||||
|
||||
@ -72,3 +72,8 @@ This model was contributed by [yuvalkirstain](https://huggingface.co/yuvalkirsta
|
||||
|
||||
[[autodoc]] SplinterForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## SplinterForPreTraining
|
||||
|
||||
[[autodoc]] SplinterForPreTraining
|
||||
- forward
|
||||
|
||||
@ -14,22 +14,22 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
## Overview
|
||||
|
||||
The Swin Transformer was proposed in [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030)
|
||||
by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
The Swin Transformer was proposed in [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030)
|
||||
by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone
|
||||
for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains,
|
||||
such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text.
|
||||
To address these differences, we propose a hierarchical Transformer whose representation is computed with \bold{S}hifted
|
||||
\bold{win}dows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping
|
||||
local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at
|
||||
various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it
|
||||
compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense
|
||||
prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation
|
||||
(53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and
|
||||
+2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones.
|
||||
*This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone
|
||||
for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains,
|
||||
such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text.
|
||||
To address these differences, we propose a hierarchical Transformer whose representation is computed with \bold{S}hifted
|
||||
\bold{win}dows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping
|
||||
local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at
|
||||
various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it
|
||||
compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense
|
||||
prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation
|
||||
(53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and
|
||||
+2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones.
|
||||
The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures.*
|
||||
|
||||
Tips:
|
||||
@ -38,11 +38,11 @@ Tips:
|
||||
- Swin can be used as a *backbone*. When `output_hidden_states = True`, it will output both `hidden_states` and `reshaped_hidden_states`. The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than `(batch_size, sequence_length, num_channels)`.
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/swin_transformer_architecture.png"
|
||||
alt="drawing" width="600"/>
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> Swin Transformer architecture. Taken from the <a href="https://arxiv.org/abs/2102.03334">original paper</a>.</small>
|
||||
|
||||
This model was contributed by [novice03](https://huggingface.co/novice03>). The original code can be found [here](https://github.com/microsoft/Swin-Transformer).
|
||||
This model was contributed by [novice03](https://huggingface.co/novice03>). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts). The original code can be found [here](https://github.com/microsoft/Swin-Transformer).
|
||||
|
||||
|
||||
## SwinConfig
|
||||
@ -63,4 +63,19 @@ This model was contributed by [novice03](https://huggingface.co/novice03>). The
|
||||
## SwinForImageClassification
|
||||
|
||||
[[autodoc]] transformers.SwinForImageClassification
|
||||
- forward
|
||||
- forward
|
||||
|
||||
## TFSwinModel
|
||||
|
||||
[[autodoc]] TFSwinModel
|
||||
- call
|
||||
|
||||
## TFSwinForMaskedImageModeling
|
||||
|
||||
[[autodoc]] TFSwinForMaskedImageModeling
|
||||
- call
|
||||
|
||||
## TFSwinForImageClassification
|
||||
|
||||
[[autodoc]] transformers.TFSwinForImageClassification
|
||||
- call
|
||||
|
||||
@ -252,10 +252,9 @@ The example above only shows a single example. You can also do batched inference
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
||||
|
||||
>>> task_prefix = "translate English to German: "
|
||||
>>> sentences = [
|
||||
... "The house is wonderful.",
|
||||
... "I like to work in NYC.",
|
||||
>>> ] # use different length sentences to test batching
|
||||
>>> # use different length sentences to test batching
|
||||
>>> sentences = ["The house is wonderful.", "I like to work in NYC."]
|
||||
|
||||
>>> inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
|
||||
|
||||
>>> output_sequences = model.generate(
|
||||
|
||||
49
docs/source/en/model_doc/trajectory_transformer.mdx
Normal file
49
docs/source/en/model_doc/trajectory_transformer.mdx
Normal file
@ -0,0 +1,49 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Trajectory Transformer
|
||||
|
||||
## Overview
|
||||
|
||||
The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Reinforcement learning (RL) is typically concerned with estimating stationary policies or single-step models,
|
||||
leveraging the Markov property to factorize problems in time. However, we can also view RL as a generic sequence
|
||||
modeling problem, with the goal being to produce a sequence of actions that leads to a sequence of high rewards.
|
||||
Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well
|
||||
in other domains, such as natural-language processing, can also provide effective solutions to the RL problem.
|
||||
To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture
|
||||
to model distributions over trajectories and repurposing beam search as a planning algorithm. Framing RL as sequence
|
||||
modeling problem simplifies a range of design decisions, allowing us to dispense with many of the components common
|
||||
in offline RL algorithms. We demonstrate the flexibility of this approach across long-horizon dynamics prediction,
|
||||
imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with
|
||||
existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.*
|
||||
|
||||
Tips:
|
||||
|
||||
This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from
|
||||
actions, states and rewards from all previous timesteps. This model will treat all these elements together
|
||||
as one big sequence (a trajectory).
|
||||
|
||||
This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer).
|
||||
|
||||
## TrajectoryTransformerConfig
|
||||
|
||||
[[autodoc]] TrajectoryTransformerConfig
|
||||
|
||||
|
||||
## TrajectoryTransformerModel
|
||||
|
||||
[[autodoc]] TrajectoryTransformerModel
|
||||
- forward
|
||||
@ -51,8 +51,6 @@ found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT).
|
||||
|
||||
## UniSpeechSat specific outputs
|
||||
|
||||
[[autodoc]] models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatBaseModelOutput
|
||||
|
||||
[[autodoc]] models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatForPreTrainingOutput
|
||||
|
||||
## UniSpeechSatModel
|
||||
|
||||
@ -46,8 +46,6 @@ found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech).
|
||||
|
||||
## UniSpeech specific outputs
|
||||
|
||||
[[autodoc]] models.unispeech.modeling_unispeech.UniSpeechBaseModelOutput
|
||||
|
||||
[[autodoc]] models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput
|
||||
|
||||
## UniSpeechModel
|
||||
|
||||
73
docs/source/en/model_doc/wav2vec2-conformer.mdx
Normal file
73
docs/source/en/model_doc/wav2vec2-conformer.mdx
Normal file
@ -0,0 +1,73 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Wav2Vec2-Conformer
|
||||
|
||||
## Overview
|
||||
|
||||
The Wav2Vec2-Conformer was added to an updated version of [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
|
||||
|
||||
The official results of the model can be found in Table 3 and Table 4 of the paper.
|
||||
|
||||
The Wav2Vec2-Conformer weights were released by the Meta AI team within the [Fairseq library](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/README.md#pre-trained-models).
|
||||
|
||||
Tips:
|
||||
|
||||
- Wav2Vec2-Conformer follows the same architecture as Wav2Vec2, but replaces the *Attention*-block with a *Conformer*-block
|
||||
as introduced in [Conformer: Convolution-augmented Transformer for Speech Recognition](https://arxiv.org/abs/2005.08100).
|
||||
- For the same number of layers, Wav2Vec2-Conformer requires more parameters than Wav2Vec2, but also yields
|
||||
an improved word error rate.
|
||||
- Wav2Vec2-Conformer uses the same tokenizer and feature extractor as Wav2Vec2.
|
||||
- Wav2Vec2-Conformer can use either no relative position embeddings, Transformer-XL-like position embeddings, or
|
||||
rotary position embeddings by setting the correct `config.position_embeddings_type`.
|
||||
|
||||
This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
|
||||
The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec).
|
||||
|
||||
|
||||
## Wav2Vec2ConformerConfig
|
||||
|
||||
[[autodoc]] Wav2Vec2ConformerConfig
|
||||
|
||||
## Wav2Vec2Conformer specific outputs
|
||||
|
||||
[[autodoc]] models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForPreTrainingOutput
|
||||
|
||||
## Wav2Vec2ConformerModel
|
||||
|
||||
[[autodoc]] Wav2Vec2ConformerModel
|
||||
- forward
|
||||
|
||||
## Wav2Vec2ConformerForCTC
|
||||
|
||||
[[autodoc]] Wav2Vec2ConformerForCTC
|
||||
- forward
|
||||
|
||||
## Wav2Vec2ConformerForSequenceClassification
|
||||
|
||||
[[autodoc]] Wav2Vec2ConformerForSequenceClassification
|
||||
- forward
|
||||
|
||||
## Wav2Vec2ConformerForAudioFrameClassification
|
||||
|
||||
[[autodoc]] Wav2Vec2ConformerForAudioFrameClassification
|
||||
- forward
|
||||
|
||||
## Wav2Vec2ConformerForXVector
|
||||
|
||||
[[autodoc]] Wav2Vec2ConformerForXVector
|
||||
- forward
|
||||
|
||||
## Wav2Vec2ConformerForPreTraining
|
||||
|
||||
[[autodoc]] Wav2Vec2ConformerForPreTraining
|
||||
- forward
|
||||
@ -49,10 +49,6 @@ found [here](https://github.com/microsoft/unilm/tree/master/wavlm).
|
||||
|
||||
[[autodoc]] WavLMConfig
|
||||
|
||||
## WavLM specific outputs
|
||||
|
||||
[[autodoc]] models.wavlm.modeling_wavlm.WavLMBaseModelOutput
|
||||
|
||||
## WavLMModel
|
||||
|
||||
[[autodoc]] WavLMModel
|
||||
|
||||
60
docs/source/en/model_doc/yolos.mdx
Normal file
60
docs/source/en/model_doc/yolos.mdx
Normal file
@ -0,0 +1,60 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# YOLOS
|
||||
|
||||
## Overview
|
||||
|
||||
The YOLOS model was proposed in [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
|
||||
YOLOS proposes to just leverage the plain [Vision Transformer (ViT)](vit) for object detection, inspired by DETR. It turns out that a base-sized encoder-only Transformer can also achieve 42 AP on COCO, similar to DETR and much more complex frameworks such as Faster R-CNN.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Can Transformer perform 2D object- and region-level recognition from a pure sequence-to-sequence perspective with minimal knowledge about the 2D spatial structure? To answer this question, we present You Only Look at One Sequence (YOLOS), a series of object detection models based on the vanilla Vision Transformer with the fewest possible modifications, region priors, as well as inductive biases of the target task. We find that YOLOS pre-trained on the mid-sized ImageNet-1k dataset only can already achieve quite competitive performance on the challenging COCO object detection benchmark, e.g., YOLOS-Base directly adopted from BERT-Base architecture can obtain 42.0 box AP on COCO val. We also discuss the impacts as well as limitations of current pre-train schemes and model scaling strategies for Transformer in vision through YOLOS.*
|
||||
|
||||
Tips:
|
||||
|
||||
- One can use [`YolosFeatureExtractor`] for preparing images (and optional targets) for the model. Contrary to [DETR](detr), YOLOS doesn't require a `pixel_mask` to be created.
|
||||
- Demo notebooks (regarding inference and fine-tuning on custom data) can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS).
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/yolos_architecture.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> YOLOS architecture. Taken from the <a href="https://arxiv.org/abs/2106.00666">original paper</a>.</small>
|
||||
|
||||
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/hustvl/YOLOS).
|
||||
|
||||
## YolosConfig
|
||||
|
||||
[[autodoc]] YolosConfig
|
||||
|
||||
|
||||
## YolosFeatureExtractor
|
||||
|
||||
[[autodoc]] YolosFeatureExtractor
|
||||
- __call__
|
||||
- pad
|
||||
- post_process
|
||||
- post_process_segmentation
|
||||
- post_process_panoptic
|
||||
|
||||
|
||||
## YolosModel
|
||||
|
||||
[[autodoc]] YolosModel
|
||||
- forward
|
||||
|
||||
|
||||
## YolosForObjectDetection
|
||||
|
||||
[[autodoc]] YolosForObjectDetection
|
||||
- forward
|
||||
150
docs/source/en/perf_hardware.mdx
Normal file
150
docs/source/en/perf_hardware.mdx
Normal file
@ -0,0 +1,150 @@
|
||||
<!---
|
||||
Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
|
||||
# Custom hardware for training
|
||||
|
||||
The hardware you use to run model training and inference can have a big effect on performance. For a deep dive into GPUs make sure to check out Tim Dettmer's excellent [blog post](https://timdettmers.com/2020/09/07/which-gpu-for-deep-learning/).
|
||||
|
||||
Let's have a look at some practical advice for GPU setups.
|
||||
|
||||
## GPU
|
||||
When you train bigger models you have essentially three options:
|
||||
- bigger GPUs
|
||||
- more GPUs
|
||||
- more CPU and NVMe (offloaded to by [DeepSpeed-Infinity](main_classes/deepspeed#nvme-support))
|
||||
|
||||
Let's start at the case where you have a single GPU.
|
||||
|
||||
### Power and Cooling
|
||||
|
||||
If you bought an expensive high end GPU make sure you give it the correct power and sufficient cooling.
|
||||
|
||||
**Power**:
|
||||
|
||||
Some high end consumer GPU cards have 2 and sometimes 3 PCI-E 8-Pin power sockets. Make sure you have as many independent 12V PCI-E 8-Pin cables plugged into the card as there are sockets. Do not use the 2 splits at one end of the same cable (also known as pigtail cable). That is if you have 2 sockets on the GPU, you want 2 PCI-E 8-Pin cables going from your PSU to the card and not one that has 2 PCI-E 8-Pin connectors at the end! You won't get the full performance out of your card otherwise.
|
||||
|
||||
Each PCI-E 8-Pin power cable needs to be plugged into a 12V rail on the PSU side and can supply up to 150W of power.
|
||||
|
||||
Some other cards may use a PCI-E 12-Pin connectors, and these can deliver up to 500-600W of power.
|
||||
|
||||
Low end cards may use 6-Pin connectors, which supply up to 75W of power.
|
||||
|
||||
Additionally you want the high-end PSU that has stable voltage. Some lower quality ones may not give the card the stable voltage it needs to function at its peak.
|
||||
|
||||
And of course the PSU needs to have enough unused Watts to power the card.
|
||||
|
||||
**Cooling**:
|
||||
|
||||
When a GPU gets overheated it will start throttling down and will not deliver full performance and it can even shutdown if it gets too hot.
|
||||
|
||||
It's hard to tell the exact best temperature to strive for when a GPU is heavily loaded, but probably anything under +80C is good, but lower is better - perhaps 70-75C is an excellent range to be in. The throttling down is likely to start at around 84-90C. But other than throttling performance a prolonged very high temperature is likely to reduce the lifespan of a GPU.
|
||||
|
||||
Next let's have a look at one of the most important aspects when having multiple GPUs: connectivity.
|
||||
|
||||
### Multi-GPU Connectivity
|
||||
|
||||
If you use multiple GPUs the way cards are inter-connected can have a huge impact on the total training time. If the GPUs are on the same physical node, you can run:
|
||||
|
||||
```
|
||||
nvidia-smi topo -m
|
||||
```
|
||||
|
||||
and it will tell you how the GPUs are inter-connected. On a machine with dual-GPU and which are connected with NVLink, you will most likely see something like:
|
||||
|
||||
```
|
||||
GPU0 GPU1 CPU Affinity NUMA Affinity
|
||||
GPU0 X NV2 0-23 N/A
|
||||
GPU1 NV2 X 0-23 N/A
|
||||
```
|
||||
|
||||
on a different machine w/o NVLink we may see:
|
||||
```
|
||||
GPU0 GPU1 CPU Affinity NUMA Affinity
|
||||
GPU0 X PHB 0-11 N/A
|
||||
GPU1 PHB X 0-11 N/A
|
||||
```
|
||||
|
||||
The report includes this legend:
|
||||
|
||||
```
|
||||
X = Self
|
||||
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
|
||||
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
|
||||
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
|
||||
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
|
||||
PIX = Connection traversing at most a single PCIe bridge
|
||||
NV# = Connection traversing a bonded set of # NVLinks
|
||||
```
|
||||
|
||||
So the first report `NV2` tells us the GPUs are interconnected with 2 NVLinks, and the second report `PHB` we have a typical consumer-level PCIe+Bridge setup.
|
||||
|
||||
Check what type of connectivity you have on your setup. Some of these will make the communication between cards faster (e.g. NVLink), others slower (e.g. PHB).
|
||||
|
||||
Depending on the type of scalability solution used, the connectivity speed could have a major or a minor impact. If the GPUs need to sync rarely, as in DDP, the impact of a slower connection will be less significant. If the GPUs need to send messages to each other often, as in ZeRO-DP, then faster connectivity becomes super important to achieve faster training.
|
||||
|
||||
#### NVlink
|
||||
|
||||
[NVLink](https://en.wikipedia.org/wiki/NVLink) is a wire-based serial multi-lane near-range communications link developed by Nvidia.
|
||||
|
||||
Each new generation provides a faster bandwidth, e.g. here is a quote from [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf):
|
||||
|
||||
> Third-Generation NVLink®
|
||||
> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links,
|
||||
> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four
|
||||
> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth
|
||||
> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink.
|
||||
> (Note that 3-Way and 4-Way SLI configurations are not supported.)
|
||||
|
||||
So the higher `X` you get in the report of `NVX` in the output of `nvidia-smi topo -m` the better. The generation will depend on your GPU architecture.
|
||||
|
||||
Let's compare the execution of a gpt2 language model training over a small sample of wikitext.
|
||||
|
||||
The results are:
|
||||
|
||||
|
||||
| NVlink | Time |
|
||||
| ----- | ---: |
|
||||
| Y | 101s |
|
||||
| N | 131s |
|
||||
|
||||
|
||||
You can see that NVLink completes the training ~23% faster. In the second benchmark we use `NCCL_P2P_DISABLE=1` to tell the GPUs not to use NVLink.
|
||||
|
||||
Here is the full benchmark code and outputs:
|
||||
|
||||
```bash
|
||||
# DDP w/ NVLink
|
||||
|
||||
rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \
|
||||
--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
|
||||
--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
|
||||
--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
|
||||
|
||||
{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
|
||||
|
||||
# DDP w/o NVLink
|
||||
|
||||
rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \
|
||||
--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
|
||||
--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train
|
||||
--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
|
||||
|
||||
{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
|
||||
```
|
||||
|
||||
Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`)
|
||||
Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0`
|
||||
57
docs/source/en/perf_infer_cpu.mdx
Normal file
57
docs/source/en/perf_infer_cpu.mdx
Normal file
@ -0,0 +1,57 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
-->
|
||||
|
||||
# Efficient Inference on CPU
|
||||
|
||||
This guide focuses on inferencing large models efficiently on CPU.
|
||||
|
||||
## PyTorch JIT-mode (TorchScript)
|
||||
TorchScript is a way to create serializable and optimizable models from PyTorch code. Any TorchScript program can be saved from a Python process and loaded in a process where there is no Python dependency.
|
||||
Comparing to default eager mode, jit mode in PyTorch normally yields better performance for model inference from optimization methodologies like operator fusion.
|
||||
|
||||
For a gentle introduction to TorchScript, see the Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules).
|
||||
|
||||
### IPEX Graph Optimization with JIT-mode
|
||||
Intel® Extension for PyTorch provides further optimizations in jit mode for Transformers series models. It is highly recommended for users to take advantage of Intel® Extension for PyTorch with jit mode. Some frequently used operator patterns from Transformers models are already supported in Intel® Extension for PyTorch with jit mode fusions. Those fusion patterns like Multi-head-attention fusion, Concat Linear, Linear+Add, Linear+Gelu, Add+LayerNorm fusion and etc. are enabled and perform well. The benefit of the fusion is delivered to users in a transparent fashion. According to the analysis, ~70% of most popular NLP tasks in question-answering, text-classification, and token-classification can get performance benefits with these fusion patterns for both Float32 precision and BFloat16 Mixed precision.
|
||||
|
||||
Check more detailed information for [IPEX Graph Optimization](https://intel.github.io/intel-extension-for-pytorch/1.11.200/tutorials/features/graph_optimization.html).
|
||||
|
||||
#### IPEX installation:
|
||||
|
||||
IPEX release is following PyTorch, check the approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/).
|
||||
|
||||
### Usage of JIT-mode
|
||||
To enable jit mode in Trainer, users should add `jit_mode_eval` in Trainer command arguments.
|
||||
|
||||
Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)
|
||||
|
||||
- Inference using jit mode on CPU:
|
||||
<pre>python run_qa.py \
|
||||
--model_name_or_path csarron/bert-base-uncased-squad-v1 \
|
||||
--dataset_name squad \
|
||||
--do_eval \
|
||||
--max_seq_length 384 \
|
||||
--doc_stride 128 \
|
||||
--output_dir /tmp/ \
|
||||
--no_cuda \
|
||||
<b>--jit_mode_eval </b></pre>
|
||||
|
||||
- Inference with IPEX using jit mode on CPU:
|
||||
<pre>python run_qa.py \
|
||||
--model_name_or_path csarron/bert-base-uncased-squad-v1 \
|
||||
--dataset_name squad \
|
||||
--do_eval \
|
||||
--max_seq_length 384 \
|
||||
--doc_stride 128 \
|
||||
--output_dir /tmp/ \
|
||||
--no_cuda \
|
||||
<b>--use_ipex \</b>
|
||||
<b>--jit_mode_eval</b></pre>
|
||||
61
docs/source/en/perf_train_cpu.mdx
Normal file
61
docs/source/en/perf_train_cpu.mdx
Normal file
@ -0,0 +1,61 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
-->
|
||||
|
||||
# Efficient Training on CPU
|
||||
|
||||
This guide focuses on training large models efficiently on CPU.
|
||||
|
||||
## Mixed precision with IPEX
|
||||
|
||||
IPEX is optimized for CPUs with AVX-512 or above, and functionally works for CPUs with only AVX2. So, it is expected to bring performance benefit for Intel CPU generations with AVX-512 or above while CPUs with only AVX2 (e.g., AMD CPUs or older Intel CPUs) might result in a better performance under IPEX, but not guaranteed. IPEX provides performance optimizations for CPU training with both Float32 and BFloat16. The usage of BFloat16 is the main focus of the following sections.
|
||||
|
||||
Low precision data type BFloat16 has been natively supported on the 3rd Generation Xeon® Scalable Processors (aka Cooper Lake) with AVX512 instruction set and will be supported on the next generation of Intel® Xeon® Scalable Processors with Intel® Advanced Matrix Extensions (Intel® AMX) instruction set with further boosted performance. The Auto Mixed Precision for CPU backend has been enabled since PyTorch-1.10. At the same time, the support of Auto Mixed Precision with BFloat16 for CPU and BFloat16 optimization of operators has been massively enabled in Intel® Extension for PyTorch, and partially upstreamed to PyTorch master branch. Users can get better performance and user experience with IPEX Auto Mixed Precision.
|
||||
|
||||
Check more detailed information for [Auto Mixed Precision](https://intel.github.io/intel-extension-for-pytorch/1.11.200/tutorials/features/amp.html).
|
||||
|
||||
### IPEX installation:
|
||||
|
||||
IPEX release is following PyTorch, to install via pip:
|
||||
|
||||
For PyTorch-1.10:
|
||||
|
||||
```
|
||||
pip install intel_extension_for_pytorch==1.10.100+cpu -f https://software.intel.com/ipex-whl-stable
|
||||
```
|
||||
|
||||
For PyTorch-1.11:
|
||||
|
||||
```
|
||||
pip install intel_extension_for_pytorch==1.11.200+cpu -f https://software.intel.com/ipex-whl-stable
|
||||
```
|
||||
|
||||
Check more approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/1.11.200/tutorials/installation.html).
|
||||
|
||||
### Usage in Trainer
|
||||
To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex`, `bf16` and `no_cuda` in training command arguments.
|
||||
|
||||
Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)
|
||||
|
||||
- Training with IPEX using BF16 auto mixed precision on CPU:
|
||||
<pre> python run_qa.py \
|
||||
--model_name_or_path bert-base-uncased \
|
||||
--dataset_name squad \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--per_device_train_batch_size 12 \
|
||||
--learning_rate 3e-5 \
|
||||
--num_train_epochs 2 \
|
||||
--max_seq_length 384 \
|
||||
--doc_stride 128 \
|
||||
--output_dir /tmp/debug_squad/ \
|
||||
<b>--use_ipex \</b>
|
||||
<b>--bf16 --no_cuda</b></pre>
|
||||
|
||||
@ -1,47 +1,157 @@
|
||||
<!---
|
||||
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
-->
|
||||
|
||||
# Model Parallelism
|
||||
# Efficient Training on Multiple GPUs
|
||||
|
||||
|
||||
## Parallelism overview
|
||||
|
||||
In the modern machine learning the various approaches to parallelism are used to:
|
||||
1. fit very large models onto limited hardware - e.g. t5-11b is 45GB in just model params
|
||||
2. significantly speed up training - finish training that would take a year in hours
|
||||
When training on a single GPU is too slow or the model weights don't fit in a single GPUs memory we use a mutli-GPU setup. Switching from a single GPU to multiple requires some form of parallelism as the work needs to be distributed. There are several techniques to achieve parallism such as data, tensor, or pipeline parallism. However, there is no one solution to fit them all and which settings works best depends on the hardware you are running on. While the main concepts most likely will apply to any other framework, this article is focused on PyTorch-based implementations.
|
||||
|
||||
We will first discuss in depth various 1D parallelism techniques and their pros and cons and then look at how they can be combined into 2D and 3D parallelism to enable an even faster training and to support even bigger models. Various other powerful alternative approaches will be presented.
|
||||
|
||||
While the main concepts most likely will apply to any other framework, this article is focused on PyTorch-based implementations.
|
||||
|
||||
|
||||
## Concepts
|
||||
|
||||
The following is the brief description of the main concepts that will be described later in depth in this document.
|
||||
|
||||
1. DataParallel (DP) - the same setup is replicated multiple times, and each being fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step.
|
||||
2. TensorParallel (TP) - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single gpu, each shard of the tensor resides on its designated gpu. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens on horizontal level.
|
||||
3. PipelineParallel (PP) - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are places on a single gpu. Each gpu processes in parallel different stages of the pipeline and working on a small chunk of the batch.
|
||||
4. Zero Redundancy Optimizer (ZeRO) - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. It also supports various offloading techniques to compensate for limited GPU memory.
|
||||
5. Sharded DDP - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO.
|
||||
1. **DataParallel (DP)** - the same setup is replicated multiple times, and each being fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step.
|
||||
2. **TensorParallel (TP)** - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single gpu, each shard of the tensor resides on its designated gpu. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens on horizontal level.
|
||||
3. **PipelineParallel (PP)** - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are places on a single gpu. Each gpu processes in parallel different stages of the pipeline and working on a small chunk of the batch.
|
||||
4. **Zero Redundancy Optimizer (ZeRO)** - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. It also supports various offloading techniques to compensate for limited GPU memory.
|
||||
5. **Sharded DDP** - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO.
|
||||
|
||||
Before diving deeper into the specifics of each concept we first have a look at the rough decision process when training large models on a large infrastructure.
|
||||
|
||||
## Scalability Strategy
|
||||
|
||||
**⇨ Single Node / Multi-GPU**
|
||||
* Model fits onto a single GPU:
|
||||
|
||||
1. DDP - Distributed DP
|
||||
2. ZeRO - may or may not be faster depending on the situation and configuration used
|
||||
|
||||
* Model doesn't fit onto a single GPU:
|
||||
|
||||
1. PP
|
||||
2. ZeRO
|
||||
3. TP
|
||||
|
||||
With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup.
|
||||
|
||||
TP is almost always used within a single node. That is TP size <= gpus per node.
|
||||
|
||||
* Largest Layer not fitting into a single GPU:
|
||||
|
||||
1. If not using ZeRO - must use TP, as PP alone won't be able to fit.
|
||||
2. With ZeRO see the same entry for "Single GPU" above
|
||||
|
||||
|
||||
**⇨ Multi-Node / Multi-GPU**
|
||||
|
||||
* When you have fast inter-node connectivity:
|
||||
|
||||
1. ZeRO - as it requires close to no modifications to the model
|
||||
2. PP+TP+DP - less communications, but requires massive changes to the model
|
||||
|
||||
* when you have slow inter-node connectivity and still low on GPU memory:
|
||||
|
||||
1. DP+PP+TP+ZeRO-1
|
||||
|
||||
|
||||
|
||||
## Data Parallelism
|
||||
|
||||
Most users with just 2 GPUs already enjoy the increased training speed up thanks to `DataParallel` (DP) and `DistributedDataParallel` (DDP) that are almost trivial to use. This is a built-in feature of Pytorch.
|
||||
Most users with just 2 GPUs already enjoy the increased training speed up thanks to `DataParallel` (DP) and `DistributedDataParallel` (DDP) that are almost trivial to use. This is a built-in feature of Pytorch. Note that in general it is advised to use DDP as it is better maintained and works for all models while DP might fail for some models. [PyTorch documentation](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html) itself recommends the use of DDP.
|
||||
|
||||
### DP vs DDP
|
||||
|
||||
`DistributedDataParallel` (DDP) is typically faster than `DataParallel` (DP), but it is not always the case:
|
||||
* while DP is python threads-based, DDP is multiprocess-based - and as such it has no python threads limitations, such as GIL
|
||||
* on the other hand a slow inter-connectivity between the GPU cards could lead to an actual slower outcome with DDP
|
||||
|
||||
Here are the main differences in the inter-GPU communication overhead between the two modes:
|
||||
|
||||
[DDP](https://pytorch.org/docs/master/notes/ddp.html):
|
||||
|
||||
- At the start time the main process replicates the model once from gpu 0 to the rest of gpus
|
||||
- Then for each batch:
|
||||
1. each gpu consumes each own mini-batch of data directly
|
||||
2. during `backward`, once the local gradients are ready, they are then averaged across all processes
|
||||
|
||||
[DP](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html):
|
||||
|
||||
For each batch:
|
||||
1. gpu 0 reads the batch of data and then sends a mini-batch to each gpu
|
||||
2. replicates the up-to-date model from gpu 0 to each gpu
|
||||
3. runs `forward` and sends output from each gpu to gpu 0, computes loss
|
||||
4. scatters loss from gpu 0 to all gpus, runs `backward`
|
||||
5. sends gradients from each gpu to gpu 0 and averages those
|
||||
|
||||
The only communication DDP performs per batch is sending gradients, whereas DP does 5 different data exchanges per batch.
|
||||
|
||||
DP copies data within the process via python threads, whereas DDP copies data via [torch.distributed](https://pytorch.org/docs/master/distributed.html).
|
||||
|
||||
Under DP gpu 0 performs a lot more work than the rest of the gpus, thus resulting in under-utilization of gpus.
|
||||
|
||||
You can use DDP across multiple machines, but this is not the case with DP.
|
||||
|
||||
There are other differences between DP and DDP but they aren't relevant to this discussion.
|
||||
|
||||
If you want to go really deep into understanding these 2 modes, this [article](https://www.telesens.co/2019/04/04/distributed-data-parallel-training-using-pytorch-on-aws/) is highly recommended, as it has great diagrams, includes multiple benchmarks and profiler outputs on various hardware, explains all the nuances that you may need to know.
|
||||
|
||||
Let's look at an actual benchmark:
|
||||
|
||||
| Type | NVlink | Time |
|
||||
| :----- | ----- | ---: |
|
||||
| 2:DP | Y | 110s |
|
||||
| 2:DDP | Y | 101s |
|
||||
| 2:DDP | N | 131s |
|
||||
|
||||
|
||||
Analysis:
|
||||
|
||||
Here DP is ~10% slower than DDP w/ NVlink, but ~15% faster than DDP w/o NVlink
|
||||
|
||||
The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync, the more a slow link will slow down the total runtime.
|
||||
|
||||
Here is the full benchmark code and outputs:
|
||||
|
||||
`NCCL_P2P_DISABLE=1` was used to disable the NVLink feature on the corresponding benchmark.
|
||||
|
||||
```
|
||||
|
||||
# DP
|
||||
rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
|
||||
python examples/pytorch/language-modeling/run_clm.py \
|
||||
--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
|
||||
--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
|
||||
|
||||
{'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69}
|
||||
|
||||
# DDP w/ NVlink
|
||||
rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
|
||||
python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
|
||||
--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
|
||||
--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
|
||||
|
||||
{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
|
||||
|
||||
# DDP w/o NVlink
|
||||
rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \
|
||||
python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
|
||||
--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
|
||||
--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
|
||||
|
||||
{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
|
||||
```
|
||||
|
||||
Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`)
|
||||
Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0`
|
||||
|
||||
## ZeRO Data Parallelism
|
||||
|
||||
@ -356,7 +466,7 @@ One very important aspect is that FlexFlow is designed for optimizing DNN parall
|
||||
|
||||
So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan for that. And then you can train. A different setup will have its own custom optimization.
|
||||
|
||||
🤗 Transformers status: not yet integrated. We already have our models FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/main/src/transformers/utils/fx.py), which is a prerequisite for FlexFlow, so someone needs to figure out what needs to be done to make FlexFlow work with our models.
|
||||
🤗 Transformers status: not yet integrated. We already have our models FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py), which is a prerequisite for FlexFlow, so someone needs to figure out what needs to be done to make FlexFlow work with our models.
|
||||
|
||||
|
||||
## Which Strategy To Use When
|
||||
720
docs/source/en/perf_train_gpu_one.mdx
Normal file
720
docs/source/en/perf_train_gpu_one.mdx
Normal file
@ -0,0 +1,720 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
-->
|
||||
|
||||
# Efficient Training on a Single GPU
|
||||
|
||||
This guide focuses on training large models efficiently on a single GPU. These approaches are still valid if you have access to a machine with multiple GPUs but you will also have access to additional methods outlined in the [multi-GPU section](perf_train_gpu_many).
|
||||
|
||||
In this section we have a look at a few tricks to reduce the memory footprint and speed up training for large models and how they are integrated in the [`Trainer`] and [🤗 Accelerate](https://huggingface.co/docs/accelerate/). Each method can improve speed or memory usage which is summarized in the table below:
|
||||
|
||||
|Method|Speed|Memory|
|
||||
|:-----|:----|:-----|
|
||||
| Gradient accumulation | No | Yes |
|
||||
| Gradient checkpointing | No| Yes |
|
||||
| Mixed precision training | Yes | (No) |
|
||||
| Batch size | Yes | Yes |
|
||||
| Optimizer choice | Yes | Yes |
|
||||
| DataLoader | Yes | No |
|
||||
| DeepSpeed Zero | No | Yes |
|
||||
|
||||
A bracket means that it might not be strictly the case but is usually either not a main concern or negligable. Before we start make sure you have installed the following libraries:
|
||||
|
||||
```bash
|
||||
pip install transformers datasets accelerate nvidia-ml-py3
|
||||
```
|
||||
|
||||
The `nvidia-ml-py3` library allows us to monitor the memory usage of the models from within Python. You might be familiar with the `nvidia-smi` command in the terminal - this library allows to access the same information in Python directly.
|
||||
|
||||
Then we create some dummy data. We create random token IDs between 100 and 30000 and binary labels for a classifier. In total we get 512 sequences each with length 512 and store them in a [`Dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=dataset#datasets.Dataset) with PyTorch format.
|
||||
|
||||
|
||||
```py
|
||||
import numpy as np
|
||||
from datasets import Dataset
|
||||
|
||||
|
||||
seq_len, dataset_size = 512, 512
|
||||
dummy_data = {
|
||||
"input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
|
||||
"labels": np.random.randint(0, 1, (dataset_size)),
|
||||
}
|
||||
ds = Dataset.from_dict(dummy_data)
|
||||
ds.set_format("pt")
|
||||
```
|
||||
|
||||
We want to print some summary statistics for the GPU utilization and the training run with the [`Trainer`]. We setup a two helper functions to do just that:
|
||||
|
||||
```py
|
||||
from pynvml import *
|
||||
|
||||
|
||||
def print_gpu_utilization():
|
||||
nvmlInit()
|
||||
handle = nvmlDeviceGetHandleByIndex(0)
|
||||
info = nvmlDeviceGetMemoryInfo(handle)
|
||||
print(f"GPU memory occupied: {info.used//1024**2} MB.")
|
||||
|
||||
|
||||
def print_summary(result):
|
||||
print(f"Time: {result.metrics['train_runtime']:.2f}")
|
||||
print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
|
||||
print_gpu_utilization()
|
||||
```
|
||||
|
||||
Let's verify that we start with a free GPU memory:
|
||||
|
||||
```py
|
||||
>>> print_gpu_utilization()
|
||||
GPU memory occupied: 0 MB.
|
||||
```
|
||||
|
||||
That looks good: the GPU memory is not occupied as we would expect before we load any models. If that's not the case on your machine make sure to stop all processes that are using GPU memory. However, not all free GPU memory can be used by the user. When a model is loaded to the GPU also the kernels are loaded which can take up 1-2GB of memory. To see how much it is we load a tiny tensor into the GPU which triggers the kernels to be loaded as well.
|
||||
|
||||
```py
|
||||
>>> import torch
|
||||
|
||||
|
||||
>>> torch.ones((1, 1)).to("cuda")
|
||||
>>> print_gpu_utilization()
|
||||
GPU memory occupied: 1343 MB.
|
||||
```
|
||||
|
||||
We see that the kernels alone take up 1.3GB of GPU memory. Now let's see how much space the model uses.
|
||||
|
||||
## Load Model
|
||||
|
||||
First, we load the `bert-large-uncased` model. We load the model weights directly to the GPU so that we can check how much space just weights use.
|
||||
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForSequenceClassification
|
||||
|
||||
|
||||
>>> model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda")
|
||||
>>> print_gpu_utilization()
|
||||
GPU memory occupied: 2631 MB.
|
||||
```
|
||||
|
||||
We can see that the model weights alone take up 1.3 GB of the GPU memory. The exact number depends on the specific GPU you are using. Note that on newer GPUs a model can sometimes take up more space since the weights are loaded in an optimized fashion that speeds up the usage of the model. Now we can also quickly check if we get the same result as with `nvidia-smi` CLI:
|
||||
|
||||
|
||||
```bash
|
||||
nvidia-smi
|
||||
```
|
||||
|
||||
```bash
|
||||
Tue Jan 11 08:58:05 2022
|
||||
+-----------------------------------------------------------------------------+
|
||||
| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 |
|
||||
|-------------------------------+----------------------+----------------------+
|
||||
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
|
||||
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|
||||
| | | MIG M. |
|
||||
|===============================+======================+======================|
|
||||
| 0 Tesla V100-SXM2... On | 00000000:00:04.0 Off | 0 |
|
||||
| N/A 37C P0 39W / 300W | 2631MiB / 16160MiB | 0% Default |
|
||||
| | | N/A |
|
||||
+-------------------------------+----------------------+----------------------+
|
||||
|
||||
+-----------------------------------------------------------------------------+
|
||||
| Processes: |
|
||||
| GPU GI CI PID Type Process name GPU Memory |
|
||||
| ID ID Usage |
|
||||
|=============================================================================|
|
||||
| 0 N/A N/A 3721 C ...nvs/codeparrot/bin/python 2629MiB |
|
||||
+-----------------------------------------------------------------------------+
|
||||
```
|
||||
|
||||
We get the same number as before and you can also see that we are using a V100 GPU with 16GB of memory. So now we can start training the model and see how the GPU memory consumption changes. First, we set up a few standard training arguments that we will use across all our experiments:
|
||||
|
||||
```py
|
||||
default_args = {
|
||||
"output_dir": "tmp",
|
||||
"evaluation_strategy": "steps",
|
||||
"num_train_epochs": 1,
|
||||
"log_level": "error",
|
||||
"report_to": "none",
|
||||
}
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Note: In order to properly clear the memory after experiments we need restart the Python kernel between experiments. Run all steps above and then just one of the experiments below.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Vanilla Training
|
||||
|
||||
As a first experiment we will use the [`Trainer`] and train the model without any further modifications and a batch size of 4:
|
||||
|
||||
```py
|
||||
from transformers import TrainingArguments, Trainer, logging
|
||||
|
||||
logging.set_verbosity_error()
|
||||
|
||||
|
||||
training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
|
||||
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
|
||||
result = trainer.train()
|
||||
print_summary(result)
|
||||
```
|
||||
|
||||
```
|
||||
Time: 57.82
|
||||
Samples/second: 8.86
|
||||
GPU memory occupied: 14949 MB.
|
||||
```
|
||||
|
||||
We see that already a relatively small batch size almost fills up our GPU's entire memory. However, a larger batch size can often result in faster model convergence or better end performance. So ideally we want to tune the batch size to our model's needs and not to the GPU limitations. What's interesting is that we use much more memory than the size of the model. To understand a bit better why this is the case let's have look at a model's operations and memory needs.
|
||||
|
||||
## Anatomy of Model's Operations
|
||||
|
||||
Transformers architecture includes 3 main groups of operations grouped below by compute-intensity.
|
||||
|
||||
1. **Tensor Contractions**
|
||||
|
||||
Linear layers and components of Multi-Head Attention all do batched **matrix-matrix multiplications**. These operations are the most compute-intensive part of training a transformer.
|
||||
|
||||
2. **Statistical Normalizations**
|
||||
|
||||
Softmax and layer normalization are less compute-intensive than tensor contractions, and involve one or more **reduction operations**, the result of which is then applied via a map.
|
||||
|
||||
3. **Element-wise Operators**
|
||||
|
||||
These are the remaining operators: **biases, dropout, activations, and residual connections**. These are the least compute-intensive operations.
|
||||
|
||||
This knowledge can be helpful to know when analyzing performance bottlenecks.
|
||||
|
||||
This summary is derived from [Data Movement Is All You Need: A Case Study on Optimizing Transformers 2020](https://arxiv.org/abs/2007.00072)
|
||||
|
||||
|
||||
## Anatomy of Model's Memory
|
||||
We've seen that training the model uses much more memory than just putting the model on the GPU. This is because there are many components during training that use GPU memory. The components on GPU memory are the following:
|
||||
1. model weights
|
||||
2. optimizer states
|
||||
3. gradients
|
||||
4. forward activations saved for gradient computation
|
||||
5. temporary buffers
|
||||
6. functionality-specific memory
|
||||
|
||||
A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory. For inference there are no optimizer states and gradients, so we can subtract those. And thus we end up with 6 bytes per model parameter for mixed precision inference, plus activation memory.
|
||||
|
||||
Let's look at the details.
|
||||
|
||||
**Model Weights:**
|
||||
|
||||
- 4 bytes * number of parameters for fp32 training
|
||||
- 6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory)
|
||||
|
||||
**Optimizer States:**
|
||||
|
||||
- 8 bytes * number of parameters for normal AdamW (maintains 2 states)
|
||||
- 2 bytes * number of parameters for 8-bit AdamW optimizers like [bitsandbytes](https://github.com/facebookresearch/bitsandbytes)
|
||||
- 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state)
|
||||
|
||||
**Gradients**
|
||||
|
||||
- 4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32)
|
||||
|
||||
**Forward Activations**
|
||||
|
||||
- size depends on many factors, the key ones being sequence length, hidden size and batch size.
|
||||
|
||||
There are the input and output that are being passed and returned by the forward and the backward functions and the forward activations saved for gradient computation.
|
||||
|
||||
**Temporary Memory**
|
||||
|
||||
Additionally there are all kinds of temporary variables which get released once the calculation is done, but in the moment these could require additional memory and could push to OOM. Therefore when coding it's crucial to think strategically about such temporary variables and sometimes to explicitly free those as soon as they are no longer needed.
|
||||
|
||||
**Functionality-specific memory**
|
||||
|
||||
Then your software could have special memory needs. For example, when generating text using beam search, the software needs to maintain multiple copies of inputs and outputs.
|
||||
|
||||
**`forward` vs `backward` Execution Speed**
|
||||
|
||||
For convolutions and linear layers there are 2x flops in the backward compared to the forward, which generally translates into ~2x slower (sometimes more, because sizes in the backward tend to be more awkward). Activations are usually bandwidth-limited, and it’s typical for an activation to have to read more data in the backward than in the forward (e.g. activation forward reads once, writes once, activation backward reads twice, gradOutput and output of the forward, and writes once, gradInput).
|
||||
|
||||
So there are potentially a few places where we could save GPU memory or speed up operations. Let's start with a simple optimization: choosing the right batch size.
|
||||
|
||||
## Batch sizes
|
||||
|
||||
One gets the most efficient performance when batch sizes and input/output neuron counts are divisible by a certain number, which typically starts at 8, but can be much higher as well. That number varies a lot depending on the specific hardware being used and the dtype of the model.
|
||||
|
||||
For example for fully connected layers (which correspond to GEMMs), NVIDIA provides recommendations for [input/output neuron counts](
|
||||
https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features) and [batch size](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#batch-size).
|
||||
|
||||
[Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) define the multiplier based on the dtype and the hardware. For example, for fp16 a multiple of 8 is recommended, but on A100 it's 64!
|
||||
|
||||
For parameters that are small, there is also [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization) to consider, this is where tiling happens and the right multiplier can have a significant speedup.
|
||||
|
||||
## Gradient Accumulation
|
||||
|
||||
The idea behind gradient accumulation is to instead of calculating the gradients for the whole batch at once to do it in smaller steps. The way we do that is to calculate the gradients iteratively in smaller batches by doing a forward and backward pass through the model and accumulating the gradients in the process. When enough gradients are accumulated we run the model's optimization step. This way we can easily increase the overall batch size to numbers that would never fit into the GPU's memory. In turn, however, the added forward and backward passes can slow down the training a bit.
|
||||
|
||||
We can use gradient accumulation in the [`Trainer`] by simply adding the `gradient_accumulation_steps` argument to [`TrainingArguments`]. Let's see how it impacts the models memory footprint:
|
||||
|
||||
```py
|
||||
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)
|
||||
|
||||
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
|
||||
result = trainer.train()
|
||||
print_summary(result)
|
||||
```
|
||||
|
||||
```
|
||||
Time: 66.03
|
||||
Samples/second: 7.75
|
||||
GPU memory occupied: 8681 MB.
|
||||
```
|
||||
|
||||
We can see that the memory footprint was dramatically reduced at the cost of being only slightly slower than the vanilla run. Of course, this would change as you increase the number of accumulation steps. In general you would want to max out the GPU usage as much as possible. So in our case, the batch_size of 4 was already pretty close to the GPU's limit. If we wanted to train with a batch size of 64 we should not use `per_device_train_batch_size=1` and `gradient_accumulation_steps=64` but instead `per_device_train_batch_size=4` and `gradient_accumulation_steps=16` which has the same effective batch size while making better use of the available GPU resources.
|
||||
|
||||
For more details see the benchmarks for [RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004392537)
|
||||
and [A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1005033957).
|
||||
|
||||
Next we have a look at another trick to save a little bit more GPU memory called gradient checkpointing.
|
||||
|
||||
## Gradient Checkpointing
|
||||
|
||||
Even when we set the batch size to 1 and use gradient accumulation we can still run out of memory when working with large models. In order to compute the gradients during the backward pass all activations from the forward pass are normally saved. This can create a big memory overhead. Alternatively, one could forget all activations during the forward pass and recompute them on demand during the backward pass. This would however add a significant computational overhead and slow down training.
|
||||
|
||||
Gradient checkpointing strikes a compromise between the two approaches and saves strategically selected activations throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. See [this great article](https://medium.com/tensorflow/fitting-larger-networks-into-memory-583e3c758ff9) explaining the ideas behind gradient checkpointing.
|
||||
|
||||
To enable gradient checkpointing in the [`Trainer`] we only need ot pass it as a flag to the [`TrainingArguments`]. Everything else is handled under the hood:
|
||||
|
||||
```py
|
||||
training_args = TrainingArguments(
|
||||
per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args
|
||||
)
|
||||
|
||||
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
|
||||
result = trainer.train()
|
||||
print_summary(result)
|
||||
```
|
||||
|
||||
```
|
||||
Time: 85.47
|
||||
Samples/second: 5.99
|
||||
GPU memory occupied: 6775 MB.
|
||||
```
|
||||
|
||||
We can see that this saved some more memory but at the same time training became a bit slower. A general rule of thumb is that gradient checkpointing slows down training by about 20%. Let's have a look at another method with which we can regain some speed: mixed precision training.
|
||||
|
||||
|
||||
## Floating Data Types
|
||||
|
||||
The idea of mixed precision training is that no all variables need to be stored in full (32-bit) floating point precision. If we can reduce the precision the variales and their computations are faster. Here are the commonly used floating point data types choice of which impacts both memory usage and throughput:
|
||||
|
||||
- fp32 (`float32`)
|
||||
- fp16 (`float16`)
|
||||
- bf16 (`bfloat16`)
|
||||
- tf32 (CUDA internal data type)
|
||||
|
||||
Here is a diagram that shows how these data types correlate to each other.
|
||||
|
||||

|
||||
(source: [NVIDIA Blog](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/))
|
||||
|
||||
While fp16 and fp32 have been around for quite some time, bf16 and tf32 are only available on the Ampere architecture GPUS and TPUs support bf16 as well. Let's start with the most commonly used method which is FP16 training/
|
||||
|
||||
|
||||
### FP16 Training
|
||||
|
||||
The idea of mixed precision training is that no all variables need to be stored in full (32-bit) floating point precision. If we can reduce the precision the variales and their computations are faster. The main advantage comes from saving the activations in half (16-bit) precision. Although the gradients are also computed in half precision they are converted back to full precision for the optimization step so no memory is saved here. Since the model is present on the GPU in both 16-bit and 32-bit precision this can use more GPU memory (1.5x the original model is on the GPU), especially for small batch sizes. Since some computations are performed in full and some in half precision this approach is also called mixed precision training. Enabling mixed precision training is also just a matter of setting the `fp16` flag to `True`:
|
||||
|
||||
```py
|
||||
training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args)
|
||||
|
||||
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
|
||||
result = trainer.train()
|
||||
print_summary(result)
|
||||
```
|
||||
|
||||
```
|
||||
Time: 27.46
|
||||
Samples/second: 18.64
|
||||
GPU memory occupied: 13939 MB.
|
||||
```
|
||||
|
||||
We can see that this is almost twice as fast as the vanilla training. Let's add it to the mix of the previous methods:
|
||||
|
||||
|
||||
```py
|
||||
training_args = TrainingArguments(
|
||||
per_device_train_batch_size=1,
|
||||
gradient_accumulation_steps=4,
|
||||
gradient_checkpointing=True,
|
||||
fp16=True,
|
||||
**default_args,
|
||||
)
|
||||
|
||||
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
|
||||
result = trainer.train()
|
||||
print_summary(result)
|
||||
```
|
||||
|
||||
```
|
||||
Time: 50.76
|
||||
Samples/second: 10.09
|
||||
GPU memory occupied: 7275 MB.
|
||||
```
|
||||
|
||||
We can see that with these tweaks we use about half the GPU memory as at the beginning while also being slightly faster.
|
||||
|
||||
### BF16
|
||||
If you have access to a Ampere or newer hardware you can use bf16 for your training and evaluation. While bf16 has a worse precision than fp16, it has a much much bigger dynamic range. Therefore, if in the past you were experiencing overflow issues while training the model, bf16 will prevent this from happening most of the time. Remember that in fp16 the biggest number you can have is `65535` and any number above that will overflow. A bf16 number can be as large as `3.39e+38` (!) which is about the same as fp32 - because both have 8-bits used for the numerical range.
|
||||
|
||||
You can enable BF16 in the 🤗 Trainer with:
|
||||
|
||||
```python
|
||||
TrainingArguments(bf16=True)
|
||||
```
|
||||
|
||||
### TF32
|
||||
The Ampere hardware uses a magical data type called tf32. It has the same numerical range as fp32 (8-bits), but instead of 23 bits precision it has only 10 bits (same as fp16) and uses only 19 bits in total.
|
||||
|
||||
It's magical in the sense that you can use the normal fp32 training and/or inference code and by enabling tf32 support you can get up to 3x throughput improvement. All you need to do is to add this to your code:
|
||||
|
||||
```
|
||||
import torch
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
```
|
||||
|
||||
When this is done CUDA will automatically switch to using tf32 instead of fp32 where it's possible. This, of course, assumes that the used GPU is from the Ampere series.
|
||||
|
||||
Like all cases with reduced precision this may or may not be satisfactory for your needs, so you have to experiment and see. According to [NVIDIA research](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/) the majority of machine learning training shouldn't be impacted and showed the same perplexity and convergence as the fp32 training.
|
||||
|
||||
If you're already using fp16 or bf16 mixed precision it may help with the throughput as well.
|
||||
|
||||
You can enable this mode in the 🤗 Trainer with:
|
||||
```python
|
||||
TrainingArguments(tf32=True)
|
||||
```
|
||||
By default the PyTorch default is used.
|
||||
|
||||
Note: tf32 mode is internal to CUDA and can't be accessed directly via `tensor.to(dtype=torch.tf32)` as `torch.tf32` doesn't exit.
|
||||
|
||||
Note: you need `torch>=1.7` to enjoy this feature.
|
||||
|
||||
You can also see a variety of benchmarks on tf32 vs other precisions:
|
||||
[RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004390803) and
|
||||
[A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1004543189).
|
||||
|
||||
We've now seen how we can change the floating types to increase throughput, but we are not done, yet! There is another area where we can save GPU memory: the optimizer.
|
||||
|
||||
## Optimizer
|
||||
|
||||
The most common optimizer used to train transformer model is Adam or AdamW (Adam with weight decay). Adam achieves good convergence by storing the rolling average of the previous gradients which, however, adds an additional memory footprint of the order of the number of model parameters. One remedy to this is to use an alternative optimizer such as Adafactor, which works well for some models but often it has instability issues.
|
||||
|
||||
HF Trainer integrates a variety of optimisers that can be used out of box. To activate the desired optimizer simply pass the `--optim` flag to the command line.
|
||||
|
||||
To see which optimizers are currently supported:
|
||||
|
||||
```bash
|
||||
$ python examples/pytorch/translation/run_translation.py -h | grep "\-optim"
|
||||
[--optim {adamw_hf,adamw_torch,adamw_torch_xla,adamw_apex_fused,adafactor}]
|
||||
```
|
||||
|
||||
For example, if you have [NVIDIA/apex](https://github.com/NVIDIA/apex) installed `--optim adamw_apex_fused` will give you the fastest training experience among all supported AdamW optimizers.
|
||||
|
||||
On the other hand [8bit BNB optimizer](https://github.com/facebookresearch/bitsandbytes) can save 3/4 of memory normally used by a typical AdamW optimizer if it is configured to quantize all optimizer states, but in some situations only some optimizer states are quintized and then more memory is used. XXX: update once https://github.com/huggingface/transformers/pull/15622 is merged.
|
||||
|
||||
Let's get a feel for the numbers and use for example use a 3B-parameter model, like `t5-3b`. Note that since a Gigabyte correpsonds to a billion bytes we can simply multiply the parameters (in billions) with the number of necessary bytes per parameter to get Gigabytes of GPU memory usage:
|
||||
|
||||
- A standard AdamW uses 8 bytes for each parameter, here the optimizer will need (`8*3`) 24GB of GPU memory.
|
||||
- Adafactor uses slightly more than 4 bytes, so (`4*3`) 12GB and then some extra.
|
||||
- 8bit BNB quantized optimizer will use only (`2*3`) 6GB if all optimizer states are quantized.
|
||||
|
||||
Let's have a look at Adafactor first.
|
||||
|
||||
### Adafactor
|
||||
|
||||
Instead of keeping the rolling average for each element in the weight matrices Adafactor only stores aggregated information (row- and column-wise sums of the rolling averages) which reduces the footprint considerably. One downside of Adafactor is that in some instances convergence can be slower than Adam's so some experimentation is advised here. We can use Adafactor simply by setting `optim="adafactor"`:
|
||||
|
||||
|
||||
```py
|
||||
training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args)
|
||||
|
||||
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
|
||||
result = trainer.train()
|
||||
print_summary(result)
|
||||
```
|
||||
|
||||
```
|
||||
Time: 64.31
|
||||
Samples/second: 7.96
|
||||
GPU memory occupied: 12295 MB.
|
||||
```
|
||||
|
||||
We can see that this saves a few more GB on the GPU. Let's see how it looks when we add it to the other methods we introduced earlier:
|
||||
|
||||
|
||||
```py
|
||||
training_args = TrainingArguments(
|
||||
per_device_train_batch_size=1,
|
||||
gradient_accumulation_steps=4,
|
||||
gradient_checkpointing=True,
|
||||
fp16=True,
|
||||
optim="adafactor",
|
||||
**default_args,
|
||||
)
|
||||
|
||||
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
|
||||
result = trainer.train()
|
||||
print_summary(result)
|
||||
```
|
||||
|
||||
```
|
||||
Time: 56.54
|
||||
Samples/second: 9.06
|
||||
GPU memory occupied: 4847 MB.
|
||||
```
|
||||
|
||||
We went from 15 GB memory usage to 5 GB - a 3x improvement while maintaining the throughput! However, as mentioned before, the convergence of Adafactor can be worse than Adam. There is an alternative to Adafactor called 8-bit Adam that takes a slightly different approach.
|
||||
|
||||
### 8-bit Adam
|
||||
|
||||
Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. Quantization means that it stores the state with lower precision and dequantizes it only for the optimization. This is similar to the idea behind FP16 training where using variables with lower precision saves memory.
|
||||
|
||||
In contrast to the previous approaches is this one not integrated into the [`Trainer`] as a simple flag. We need to install the 8-bit optimizer and then pass it as a custom optimizer to the [`Trainer`]. Follow the installation guide in the Github [repo](https://github.com/facebookresearch/bitsandbytes) to install the `bitsandbytes` library that implements the 8-bit Adam optimizer.
|
||||
|
||||
Once installed, we just need to initialize the the optimizer. Although this looks like a considerable amount of work it actually just involves two steps: first we need to group the model's parameters into two groups where to one group we apply weight decay and to the other we don't. Usually, biases and layer norm parameters are not weight decayed. Then in a second step we just do some argument housekeeping to use the same parameters as the previously used AdamW optimizer.
|
||||
|
||||
<Tip>
|
||||
Note that in order to use the 8-bit optimizer with an existing pretrained model a change to the embedding layer is needed.
|
||||
Read [this issue](https://github.com/huggingface/transformers/issues/14819) for more information.
|
||||
</Tip>
|
||||
|
||||
```py
|
||||
import bitsandbytes as bnb
|
||||
from torch import nn
|
||||
from transformers.trainer_pt_utils import get_parameter_names
|
||||
|
||||
training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
|
||||
|
||||
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
|
||||
decay_parameters = [name for name in decay_parameters if "bias" not in name]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if n in decay_parameters],
|
||||
"weight_decay": training_args.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if n not in decay_parameters],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
|
||||
optimizer_kwargs = {
|
||||
"betas": (training_args.adam_beta1, training_args.adam_beta2),
|
||||
"eps": training_args.adam_epsilon,
|
||||
}
|
||||
optimizer_kwargs["lr"] = training_args.learning_rate
|
||||
adam_bnb_optim = bnb.optim.Adam8bit(
|
||||
optimizer_grouped_parameters,
|
||||
betas=(training_args.adam_beta1, training_args.adam_beta2),
|
||||
eps=training_args.adam_epsilon,
|
||||
lr=training_args.learning_rate,
|
||||
)
|
||||
```
|
||||
|
||||
We can now pass the custom optimizer as an argument to the `Trainer`:
|
||||
```py
|
||||
trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None))
|
||||
result = trainer.train()
|
||||
print_summary(result)
|
||||
```
|
||||
|
||||
```
|
||||
Time: 55.95
|
||||
Samples/second: 9.15
|
||||
GPU memory occupied: 13085 MB.
|
||||
```
|
||||
|
||||
We can see that we get a similar memory improvement as with Adafactor while keeping the full rolling average of the gradients. Let's repeat the experiment with the full settings:
|
||||
|
||||
```py
|
||||
training_args = TrainingArguments(
|
||||
per_device_train_batch_size=1,
|
||||
gradient_accumulation_steps=4,
|
||||
gradient_checkpointing=True,
|
||||
fp16=True,
|
||||
**default_args,
|
||||
)
|
||||
|
||||
trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None))
|
||||
result = trainer.train()
|
||||
print_summary(result)
|
||||
```
|
||||
|
||||
```
|
||||
Time: 49.46
|
||||
Samples/second: 10.35
|
||||
GPU memory occupied: 5363 MB.
|
||||
```
|
||||
|
||||
Again, we get about a 3x memory improvement and even slightly higher throughput as using Adafactor. So we have seen how we can optimize the memory footprint of large models. The following plot summarizes all our experiments:
|
||||
|
||||

|
||||
|
||||
### `_multi_tensor`
|
||||
pytorch-nightly introduced `torch.optim._multi_tensor` which should significantly speed up the optimizers for situations with lots of small feature tensors. It should eventually become the default, but if you want to experiment with it sooner and don't mind using the bleed-edge, see: https://github.com/huggingface/transformers/issues/9965
|
||||
|
||||
|
||||
## Using 🤗 Accelerate
|
||||
|
||||
So far we have used the [`Trainer`] to run the experiments but a more flexible alternative to that approach is to use 🤗 Accelerate. With 🤗 Accelerate you have full control over the training loop and can essentially write the loop in pure PyTorch with some minor modifications. In turn it allows you to easily scale across different infrastructures such as CPUs, GPUs, TPUs, or distributed multi-GPU setups without changing any code. Let's see what it takes to implement all of the above tweaks in 🤗 Accelerate. We can still use the [`TrainingArguments`] to wrap the training settings:
|
||||
|
||||
|
||||
```py
|
||||
training_args = TrainingArguments(
|
||||
per_device_train_batch_size=1,
|
||||
gradient_accumulation_steps=4,
|
||||
gradient_checkpointing=True,
|
||||
fp16=True,
|
||||
**default_args,
|
||||
)
|
||||
```
|
||||
|
||||
The full example training loop with 🤗 Accelerate is only a handful of lines of code long:
|
||||
|
||||
|
||||
```py
|
||||
from accelerate import Accelerator
|
||||
from torch.utils.data.dataloader import DataLoader
|
||||
|
||||
dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size)
|
||||
|
||||
if training_args.gradient_checkpointing:
|
||||
model.gradient_checkpointing_enable()
|
||||
|
||||
accelerator = Accelerator(fp16=training_args.fp16)
|
||||
model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader)
|
||||
|
||||
model.train()
|
||||
for step, batch in enumerate(dataloader, start=1):
|
||||
loss = model(**batch).loss
|
||||
loss = loss / training_args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % training_args.gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
First we wrap the dataset in a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). Then we can enable gradient checkpointing by calling the model's [`~PreTrainedModel.gradient_checkpointing_enable`] method. When we initialize the [`Accelerator`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator) we can specifiy if we want to use mixed precision training and it will take care of it for us in the [`prepare`] call. During the [`prepare`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.prepare) call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same 8-bit optimizer from the earlier experiments.
|
||||
|
||||
Finally, we can write the main training loop. Note that the `backward` call is handled by 🤗 Accelerate. We can also see how gradient accumulation works: we normalize the loss so we get the average at the end of accumulation and once we have enough steps we run the optimization. Now the question is: does this use the same amount of memory as the previous steps? Let's check:
|
||||
|
||||
|
||||
```py
|
||||
>>> print_gpu_utilization()
|
||||
GPU memory occupied: 5363 MB.
|
||||
```
|
||||
|
||||
Indeed it does. Implementing these optimization techniques with 🤗 Accelerate only takes a handful of lines of code and comes with the benefit of more flexiblity in the training loop. For a full documentation of all features have a look at the [Accelerate documentation](https://huggingface.co/docs/accelerate/index).
|
||||
|
||||
## DataLoader
|
||||
|
||||
One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it can handle. By default everything happens in the main process and it might not be able to read the data from disk fast enough, and thus create a bottleneck, leading to GPU under-utilization.
|
||||
|
||||
- `DataLoader(pin_memory=True, ...)` which ensures that the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory.
|
||||
- `DataLoader(num_workers=4, ...)` - spawn several workers to pre-load data faster - during training watch the GPU utilization stats and if it's far from 100% experiment with raising the number of workers. Of course, the problem could be elsewhere so a very big number of workers won't necessarily lead to a better performance.
|
||||
|
||||
## DeepSpeed ZeRO
|
||||
|
||||
The in-depth details on how to use Deepspeed can be found [here](main_classes/deepspeed).
|
||||
|
||||
First, a quick decision tree:
|
||||
|
||||
1. Model fits onto a single GPU and you have enough space to fit a small batch size - you don't need to use Deepspeed as it'll only slow things down in this use case.
|
||||
2. Model doesn't fit onto a single GPU or you can't fit a small batch - use DeepSpeed ZeRO + CPU Offload and for much larger models NVMe Offload.
|
||||
|
||||
Now if the decision tree suggested you use DeepSpeed first you need to [install it](main_classes/deepspeed#installation), then follow one of the following guides to create a configuration file and launch DeepSpeed.
|
||||
|
||||
Activation:
|
||||
|
||||
- HF Trainer-based examples: see this [guide](main_classes/deepspeed#deployment-with-one-gpu).
|
||||
- Custom HF Trainer-based program: Same as above, but pass:
|
||||
|
||||
```python
|
||||
TrainingArguments(deepspeed="/path/to/ds_config.json")
|
||||
```
|
||||
- Deployment in Notebooks: see this [guide](main_classes/deepspeed#deployment-in-notebooks).
|
||||
|
||||
- Custom training loop: This is somewhat complex but you can study how this is implemented in [HF Trainer](
|
||||
https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) - simply search for `deepspeed` in the code.
|
||||
|
||||
|
||||
## Choice of GPU
|
||||
Sometimes, even when applying all the above tweaks the throughput on a given GPU might still not be good enough. One easy solution is to change the type of GPU. For example switching from let's say a K80 (which you typically get on Google Colab) to a fancier GPU such as the V100 or A100. Although they are more expensive they are usually more cost effective than cheaper GPUs due to their larger memory and faster architecture.
|
||||
|
||||
Now, let's take a step back and discuss what we should optimize for when scaling the training of large models.
|
||||
|
||||
## How to scale
|
||||
|
||||
When we train models there are a two aspects we want to optimize at the same time:
|
||||
|
||||
- Data throughput/training time
|
||||
- Model performance
|
||||
|
||||
We have seen that each method changes the memory usage and throughput. In general we want to maximize the throughput (samples/second) to minimize the training cost. This is generally achieved by utilizing the GPU as much as possible and thus filling GPU memory to its limit. For example, as mentioned earlier, we only employ gradient accumulation when we want to use a batch size beyond the size of the GPU memory. If the desired batch size fits into memory then there is no reason to apply gradient accumulation which will only slow down training.
|
||||
|
||||
The second objective is model performance. Just because we can does not mean we should use a large batch size. As part of hyperparameter tuning you should determine which batch size yields the best result and then optimize the throughput accordingly.
|
||||
|
||||
|
||||
## Efficient Software Prebuilds
|
||||
|
||||
PyTorch's [pip and conda builds](https://pytorch.org/get-started/locally/#start-locally) come prebuit with the cuda toolkit which is enough to run PyTorch, but it is insufficient if you need to build cuda extensions.
|
||||
|
||||
At times it may take an additional effort to pre-build some components, e.g., if you're using libraries like `apex` that don't come pre-compiled. In other situations figuring out how to install the right cuda toolkit system-wide can be complicated. To address these users' needs PyTorch and NVIDIA release a new version of NGC docker container which already comes with everything prebuilt and you just need to install your programs on it and it will run out of the box.
|
||||
|
||||
This approach is also useful if you want to tweak the pytorch source and/or make a new customized build.
|
||||
|
||||
To find the docker image version you want start [here](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/), choose one of the latest monthly releases. Go into the release's notes for the desired release, check that the environment's components are matching your needs (including NVIDIA Driver requirements!) and then at the very top of that document go to the corresponding NGC page. If for some reason you get lost, here is [the index of all PyTorch NGC images](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch).
|
||||
|
||||
Next follow the instructions to download and deploy the docker image.
|
||||
|
||||
## Sparsity
|
||||
|
||||
### Mixture of Experts
|
||||
|
||||
Quite a few of the recent papers reported a 4-5x training speedup and a faster inference by integrating
|
||||
Mixture of Experts (MoE) into the Transformer models.
|
||||
|
||||
Since it has been discovered that more parameters lead to better performance, this technique allows to increase the number of parameters by an order of magnitude without increasing training costs.
|
||||
|
||||
In this approach every other FFN layer is replaced with a MoE Layer which consists of many experts, with a gated function that trains each expert in a balanced way depending on the input token's position in a sequence.
|
||||
|
||||

|
||||
|
||||
(source: [GLAM](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html))
|
||||
|
||||
You can find exhaustive details and comparison tables in the papers listed at the end of this section.
|
||||
|
||||
The main drawback of this approach is that it requires staggering amounts of GPU memory - almost an order of magnitude larger than its dense equivalent. Various distillation and approaches are proposed to how to overcome the much higher memory requirements.
|
||||
|
||||
There is direct trade-off though, you can use just a few experts with a 2-3x smaller base model instead of dozens or hundreds experts leading to a 5x smaller model and thus increase the training speed moderately while increasing the memory requirements moderately as well.
|
||||
|
||||
Most related papers and implementations are built around Tensorflow/TPUs:
|
||||
|
||||
- [GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding](https://arxiv.org/abs/2006.16668)
|
||||
- [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961)
|
||||
- [GLaM: Generalist Language Model (GLaM)](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html)
|
||||
|
||||
And for Pytorch DeepSpeed has built one as well: [DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale](https://arxiv.org/abs/2201.05596), [Mixture of Experts](https://www.deepspeed.ai/tutorials/mixture-of-experts/) - blog posts: [1](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/), [2](https://www.microsoft.com/en-us/research/publication/scalable-and-efficient-moe-training-for-multitask-multilingual-models/) and specific deployment with large transformer-based natural language generation models: [blog post](https://www.deepspeed.ai/news/2021/12/09/deepspeed-moe-nlg.html), [Megatron-Deepspeed branch](Thttps://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training).
|
||||
|
||||
|
||||
## Scaling beyond a single GPU
|
||||
|
||||
For some applications, such as pretraining large language models, applying all the approaches above might still not be fast enough. In this case you want to scale your experiment to several GPUs.
|
||||
|
||||
Another use case for training on many GPUs is if the model does not fit on a single GPU with all the mentioned tricks. There are still more methods we can apply although life starts to get a bit more complicated. This usually involves some form of pipeline or tensor parallelism where the model itself is distributed across several GPUs. One can also make use of DeepSpeed which implements some of these parallelism strategies along with some more optimization to reduce the memory footprint such as partitioning the optimizer states. You can read more about this in the ["Multi-GPU training" section](perf_train_gpu_many).
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -20,7 +20,7 @@ The [`pipeline`] makes it simple to use any model from the [Model Hub](https://h
|
||||
|
||||
<Tip>
|
||||
|
||||
Take a look at the [`pipeline`] documentation for a complete list of supported taska.
|
||||
Take a look at the [`pipeline`] documentation for a complete list of supported tasks.
|
||||
|
||||
</Tip>
|
||||
|
||||
@ -39,7 +39,9 @@ While each task has an associated [`pipeline`], it is simpler to use the general
|
||||
2. Pass your input text to the [`pipeline`]:
|
||||
|
||||
```py
|
||||
>>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone")
|
||||
>>> generator(
|
||||
... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone"
|
||||
... ) # doctest: +SKIP
|
||||
[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Iron-priests at the door to the east, and thirteen for the Lord Kings at the end of the mountain'}]
|
||||
```
|
||||
|
||||
@ -51,7 +53,7 @@ If you have more than one input, pass your input as a list:
|
||||
... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone",
|
||||
... "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne",
|
||||
... ]
|
||||
... )
|
||||
... ) # doctest: +SKIP
|
||||
```
|
||||
|
||||
Any additional parameters for your task can also be included in the [`pipeline`]. The `text-generation` task has a [`~generation_utils.GenerationMixin.generate`] method with several parameters for controlling the output. For example, if you want to generate more than one output, set the `num_return_sequences` parameter:
|
||||
@ -60,7 +62,7 @@ Any additional parameters for your task can also be included in the [`pipeline`]
|
||||
>>> generator(
|
||||
... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone",
|
||||
... num_return_sequences=2,
|
||||
... )
|
||||
... ) # doctest: +SKIP
|
||||
```
|
||||
|
||||
### Choose a model and tokenizer
|
||||
@ -85,7 +87,9 @@ Create a [`pipeline`] for your task, and specify the model and tokenizer you've
|
||||
Pass your input text to the [`pipeline`] to generate some text:
|
||||
|
||||
```py
|
||||
>>> generator("Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone")
|
||||
>>> generator(
|
||||
... "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone"
|
||||
... ) # doctest: +SKIP
|
||||
[{'generated_text': 'Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Seven for the Dragon-lords (for them to rule in a world ruled by their rulers, and all who live within the realm'}]
|
||||
```
|
||||
|
||||
@ -93,7 +97,18 @@ Pass your input text to the [`pipeline`] to generate some text:
|
||||
|
||||
The flexibility of the [`pipeline`] means it can also be extended to audio tasks.
|
||||
|
||||
For example, let's classify the emotion from a short clip of John F. Kennedy's famous ["We choose to go to the Moon"](https://en.wikipedia.org/wiki/We_choose_to_go_to_the_Moon) speech. Find an [audio classification](https://huggingface.co/models?pipeline_tag=audio-classification) model on the Model Hub for emotion recognition and load it in the [`pipeline`]:
|
||||
For example, let's classify the emotion in this audio clip:
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset
|
||||
>>> import torch
|
||||
|
||||
>>> torch.manual_seed(42) # doctest: +IGNORE_RESULT
|
||||
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
|
||||
>>> audio_file = ds[0]["audio"]["path"]
|
||||
```
|
||||
|
||||
Find an [audio classification](https://huggingface.co/models?pipeline_tag=audio-classification) model on the Model Hub for emotion recognition and load it in the [`pipeline`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
@ -106,12 +121,10 @@ For example, let's classify the emotion from a short clip of John F. Kennedy's f
|
||||
Pass the audio file to the [`pipeline`]:
|
||||
|
||||
```py
|
||||
>>> audio_classifier("jfk_moon_speech.wav")
|
||||
[{'label': 'calm', 'score': 0.13856211304664612},
|
||||
{'label': 'disgust', 'score': 0.13148026168346405},
|
||||
{'label': 'happy', 'score': 0.12635163962841034},
|
||||
{'label': 'angry', 'score': 0.12439591437578201},
|
||||
{'label': 'fearful', 'score': 0.12404385954141617}]
|
||||
>>> preds = audio_classifier(audio_file)
|
||||
>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
|
||||
>>> preds
|
||||
[{'score': 0.1315, 'label': 'calm'}, {'score': 0.1307, 'label': 'neutral'}, {'score': 0.1274, 'label': 'sad'}, {'score': 0.1261, 'label': 'fearful'}, {'score': 0.1242, 'label': 'happy'}]
|
||||
```
|
||||
|
||||
## Vision pipeline
|
||||
@ -126,14 +139,10 @@ Specify your vision task and pass your image to the classifier. The imaage can b
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> vision_classifier = pipeline(task="image-classification")
|
||||
>>> vision_classifier(
|
||||
>>> preds = vision_classifier(
|
||||
... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
||||
... )
|
||||
[{'label': 'lynx, catamount', 'score': 0.4403027892112732},
|
||||
{'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor',
|
||||
'score': 0.03433405980467796},
|
||||
{'label': 'snow leopard, ounce, Panthera uncia',
|
||||
'score': 0.032148055732250214},
|
||||
{'label': 'Egyptian cat', 'score': 0.02353910356760025},
|
||||
{'label': 'tiger cat', 'score': 0.023034192621707916}]
|
||||
>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
|
||||
>>> preds
|
||||
[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}]
|
||||
```
|
||||
|
||||
@ -108,6 +108,7 @@ This checks that:
|
||||
- All objects added to the init are documented (performed by `utils/check_repo.py`)
|
||||
- All `__init__.py` files have the same content in their two sections (performed by `utils/check_inits.py`)
|
||||
- All code identified as a copy from another module is consistent with the original (performed by `utils/check_copies.py`)
|
||||
- All configuration classes have at least one valid checkpoint mentioned in their docstrings (performed by `utils/check_config_docstrings.py`)
|
||||
- The translations of the READMEs and the index of the doc have the same model list as the main README (performed by `utils/check_copies.py`)
|
||||
- The auto-generated tables in the documentation are up to date (performed by `utils/check_table.py`)
|
||||
- The library has all objects available even if not all optional dependencies are installed (performed by `utils/check_dummies.py`)
|
||||
|
||||
@ -158,14 +158,17 @@ Set the `return_tensors` parameter to either `pt` for PyTorch, or `tf` for Tenso
|
||||
... "Don't think he knows about second breakfast, Pip.",
|
||||
... "What about elevensies?",
|
||||
... ]
|
||||
>>> encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
|
||||
>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
|
||||
>>> print(encoded_input)
|
||||
{'input_ids': tensor([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102],
|
||||
[ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]]),
|
||||
'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0]]),
|
||||
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
|
||||
[1, 1, 1, 1, 1, 1, 1, 1, 0]])}
|
||||
{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
|
||||
[101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
|
||||
[101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]),
|
||||
'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
|
||||
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
|
||||
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
|
||||
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
@ -178,15 +181,18 @@ Set the `return_tensors` parameter to either `pt` for PyTorch, or `tf` for Tenso
|
||||
>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
|
||||
>>> print(encoded_input)
|
||||
{'input_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
|
||||
array([[ 101, 153, 7719, 21490, 1122, 1114, 9582, 1623, 102],
|
||||
[ 101, 5226, 1122, 9649, 1199, 2610, 1236, 102, 0]],
|
||||
array([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
|
||||
[101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
|
||||
[101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
|
||||
dtype=int32)>,
|
||||
'token_type_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
|
||||
array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>,
|
||||
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>,
|
||||
'attention_mask': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
|
||||
array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
|
||||
[1, 1, 1, 1, 1, 1, 1, 1, 0]], dtype=int32)>}
|
||||
array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
|
||||
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
|
||||
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
@ -118,20 +118,25 @@ Create a [`pipeline`] with the task you want to solve for and the model you want
|
||||
Next, load a dataset (see the 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) for more details) you'd like to iterate over. For example, let's load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset:
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset
|
||||
>>> from datasets import load_dataset, Audio
|
||||
|
||||
>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT
|
||||
```
|
||||
|
||||
You can pass a whole dataset pipeline:
|
||||
We need to make sure that the sampling rate of the dataset matches the sampling
|
||||
rate `facebook/wav2vec2-base-960h` was trained on.
|
||||
|
||||
```py
|
||||
>>> files = dataset["path"]
|
||||
>>> speech_recognizer(files[:4])
|
||||
[{'text': 'I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT'},
|
||||
{'text': "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE"},
|
||||
{'text': "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS"},
|
||||
{'text': 'HOW DO I FURN A JOINA COUT'}]
|
||||
>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))
|
||||
```
|
||||
|
||||
Audio files are automatically loaded and resampled when calling the `"audio"` column.
|
||||
Let's extract the raw waveform arrays of the first 4 samples and pass it as a list to the pipeline:
|
||||
|
||||
```py
|
||||
>>> result = speech_recognizer(dataset[:4]["audio"])
|
||||
>>> print([d["text"] for d in result])
|
||||
['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FONDERING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I TURN A JOIN A COUNT']
|
||||
```
|
||||
|
||||
For a larger dataset where the inputs are big (like in speech or vision), you will want to pass along a generator instead of a list that loads all the inputs in memory. See the [pipeline documentation](./main_classes/pipelines) for more information.
|
||||
|
||||
@ -189,8 +189,9 @@ python run_summarization.py \
|
||||
|
||||
🤗 [Accelerate](https://huggingface.co/docs/accelerate/index.html) is a PyTorch-only library that offers a unified method for training a model on several types of setups (CPU-only, multiple GPUs, TPUs) while maintaining complete visibility into the PyTorch training loop. Make sure you have 🤗 Accelerate installed if you don't already have it:
|
||||
|
||||
> Note: As Accelerate is rapidly developing, the git version of accelerate must be installed to run the scripts
|
||||
```bash
|
||||
pip install accelerate
|
||||
pip install git+https://github.com/huggingface/accelerate
|
||||
```
|
||||
|
||||
Instead of the `run_summarization.py` script, you need to use the `run_summarization_no_trainer.py` script. 🤗 Accelerate supported scripts will have a `task_no_trainer.py` file in the folder. Begin by running the following command to create and save a configuration file:
|
||||
|
||||
@ -50,10 +50,15 @@ Ready-made configurations include the following architectures:
|
||||
- BEiT
|
||||
- BERT
|
||||
- BigBird
|
||||
- BigBird-Pegasus
|
||||
- Blenderbot
|
||||
- BlenderbotSmall
|
||||
- CamemBERT
|
||||
- ConvBERT
|
||||
- ConvNeXT
|
||||
- Data2VecText
|
||||
- Data2VecVision
|
||||
- DeiT
|
||||
- DistilBERT
|
||||
- ELECTRA
|
||||
- FlauBERT
|
||||
@ -61,15 +66,21 @@ Ready-made configurations include the following architectures:
|
||||
- GPT-J
|
||||
- I-BERT
|
||||
- LayoutLM
|
||||
- LongT5
|
||||
- M2M100
|
||||
- Marian
|
||||
- mBART
|
||||
- MobileBERT
|
||||
- OpenAI GPT-2
|
||||
- Perceiver
|
||||
- PLBart
|
||||
- ResNet
|
||||
- RoBERTa
|
||||
- RoFormer
|
||||
- SqueezeBERT
|
||||
- T5
|
||||
- TAPEX
|
||||
- ViT
|
||||
- XLM
|
||||
- XLM-RoBERTa
|
||||
- XLM-RoBERTa-XL
|
||||
|
||||
@ -662,4 +673,4 @@ torch.neuron.trace(model, [token_tensor, segments_tensors])
|
||||
This change enables Neuron SDK to trace the model and optimize it to run in Inf1 instances.
|
||||
|
||||
To learn more about AWS Neuron SDK features, tools, example tutorials and latest updates,
|
||||
please see the [AWS NeuronSDK documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html).
|
||||
please see the [AWS NeuronSDK documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html).
|
||||
@ -10,5 +10,5 @@ notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
|
||||
black_avoid_patterns = {
|
||||
"{processor_class}": "FakeProcessorClass",
|
||||
"{model_class}": "FakeModelClass",
|
||||
"{object_class}": "FakeObjectClass",
|
||||
"{object_class}": "FakeObjectClass",
|
||||
}
|
||||
|
||||
@ -1,17 +1,44 @@
|
||||
- sections:
|
||||
- sections:
|
||||
- local: index
|
||||
title: 🤗 Transformers
|
||||
- local: quicktour
|
||||
title: Quick tour
|
||||
title: Tour rápido
|
||||
- local: installation
|
||||
title: Instalación
|
||||
title: Get started
|
||||
title: Empezar
|
||||
- sections:
|
||||
- local: pipeline_tutorial
|
||||
title: Pipelines para inferencia
|
||||
- local: autoclass_tutorial
|
||||
title: Carga instancias preentrenadas con un AutoClass
|
||||
- local: preprocessing
|
||||
title: Preprocesamiento
|
||||
- local: training
|
||||
title: Fine-tuning a un modelo pre-entrenado
|
||||
- local: accelerate
|
||||
title: Entrenamiento distribuido con 🤗 Accelerate
|
||||
title: Tutorials
|
||||
- local: model_sharing
|
||||
title: Compartir un modelo
|
||||
title: Tutoriales
|
||||
- sections:
|
||||
- local: fast_tokenizers
|
||||
title: Usa tokenizadores de 🤗 Tokenizers
|
||||
- local: create_a_model
|
||||
title: Crea una arquitectura personalizada
|
||||
- sections:
|
||||
- local: tasks/language_modeling
|
||||
title: Modelado de lenguaje
|
||||
- local: tasks/image_classification
|
||||
title: Clasificación de imágenes
|
||||
title: Fine-tuning para tareas posteriores
|
||||
- local: sagemaker
|
||||
title: Ejecutar el entrenamiento en Amazon SageMaker
|
||||
- local: multilingual
|
||||
title: Modelos multilingües para inferencia
|
||||
title: Modelos multilingües para inferencia
|
||||
title: Guías prácticas
|
||||
- sections:
|
||||
- local: philosophy
|
||||
title: Filosofía
|
||||
- local: bertology
|
||||
title: BERTología
|
||||
title: Guías conceptuales
|
||||
@ -107,7 +107,7 @@ Una vez que hayas añadido las líneas de código relevantes, inicia el entrenam
|
||||
|
||||
### Entrenar con un script
|
||||
|
||||
Si estas corriendo tu entrenamiento desde un script ejecuta el siguiente comando para crear y guardar un archivo de configuración:
|
||||
Si estás corriendo tu entrenamiento desde un script ejecuta el siguiente comando para crear y guardar un archivo de configuración:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
|
||||
119
docs/source/es/autoclass_tutorial.mdx
Normal file
119
docs/source/es/autoclass_tutorial.mdx
Normal file
@ -0,0 +1,119 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Carga instancias preentrenadas con un AutoClass
|
||||
|
||||
Con tantas arquitecturas diferentes de Transformer puede ser retador crear una para tu checkpoint. Como parte de la filosofía central de 🤗 Transformers para hacer que la biblioteca sea fácil, simple y flexible de usar; una `AutoClass` automáticamente infiere y carga la arquitectura correcta desde un checkpoint dado. El método `from_pretrained` te permite cargar rápidamente un modelo preentrenado para cualquier arquitectura, por lo que no tendrás que dedicar tiempo y recursos para entrenar uno desde cero. Producir este tipo de código con checkpoint implica que si funciona con uno, funcionará también con otro (siempre que haya sido entrenado para una tarea similar) incluso si la arquitectura es distinta.
|
||||
|
||||
<Tip>
|
||||
|
||||
Recuerda, la arquitectura se refiere al esqueleto del modelo y los checkpoints son los pesos para una arquitectura dada. Por ejemplo, [BERT](https://huggingface.co/bert-base-uncased) es una arquitectura, mientras que `bert-base-uncased` es un checkpoint. Modelo es un término general que puede significar una arquitectura o un checkpoint.
|
||||
|
||||
</Tip>
|
||||
|
||||
En este tutorial, aprenderás a:
|
||||
|
||||
* Cargar un tokenizador pre-entrenado.
|
||||
* Cargar un extractor de características (feature extractor en inglés) pre-entrenado.
|
||||
* Cargar un procesador pre-entrenado.
|
||||
* Cargar un modelo pre-entrenado.
|
||||
|
||||
## AutoTokenizer
|
||||
|
||||
Casi cualquier tarea de Procesamiento de Lenguaje Natural comienza con un tokenizador. Un tokenizador convierte tu input a un formato que puede ser procesado por el modelo.
|
||||
|
||||
Carga un tokenizador con [`AutoTokenizer.from_pretrained`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
```
|
||||
|
||||
Luego tokeniza tu input como lo mostrado a continuación:
|
||||
|
||||
```py
|
||||
>>> sequence = "In a hole in the ground there lived a hobbit."
|
||||
>>> print(tokenizer(sequence))
|
||||
{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102],
|
||||
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
|
||||
```
|
||||
|
||||
## AutoFeatureExtractor
|
||||
|
||||
Para tareas de audio y visión, un extractor de características procesa la señal de audio o imagen al formato de input correcto.
|
||||
|
||||
Carga un extractor de características con [`AutoFeatureExtractor.from_pretrained`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoFeatureExtractor
|
||||
|
||||
>>> feature_extractor = AutoFeatureExtractor.from_pretrained(
|
||||
... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
|
||||
... )
|
||||
```
|
||||
|
||||
## AutoProcessor
|
||||
|
||||
Las tareas multimodales requieren un procesador que combine dos tipos de herramientas de preprocesamiento. Por ejemplo, el modelo [LayoutLMV2](model_doc/layoutlmv2) requiere que un extractor de características maneje las imágenes y que un tokenizador maneje el texto; un procesador combina ambas.
|
||||
|
||||
Carga un procesador con [`AutoProcessor.from_pretrained`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoProcessor
|
||||
|
||||
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
|
||||
```
|
||||
|
||||
## AutoModel
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
Finalmente, las clases `AutoModelFor` te permiten cargar un modelo preentrenado para una tarea dada (revisa [aquí](model_doc/auto) para conocer la lista completa de tareas disponibles). Por ejemplo, cargue un modelo para clasificación de secuencias con [`AutoModelForSequenceClassification.from_pretrained`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForSequenceClassification
|
||||
|
||||
>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
Reutiliza fácilmente el mismo checkpoint para cargar una aquitectura para alguna tarea diferente:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForTokenClassification
|
||||
|
||||
>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
Generalmente recomendamos utilizar las clases `AutoTokenizer` y `AutoModelFor` para cargar instancias pre-entrenadas de modelos. Ésto asegurará que cargues la arquitectura correcta en cada ocasión. En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning.
|
||||
</pt>
|
||||
<tf>
|
||||
Finalmente, la clase `TFAutoModelFor` te permite cargar tu modelo pre-entrenado para una tarea dada (revisa [aquí](model_doc/auto) para conocer la lista completa de tareas disponibles). Por ejemplo, carga un modelo para clasificación de secuencias con [`TFAutoModelForSequenceClassification.from_pretrained`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForSequenceClassification
|
||||
|
||||
>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
Reutiliza fácilmente el mismo checkpoint para cargar una aquitectura para alguna tarea diferente:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForTokenClassification
|
||||
|
||||
>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
Generalmente recomendamos utilizar las clases `AutoTokenizer` y `TFAutoModelFor` para cargar instancias de modelos pre-entrenados. Ésto asegurará que cargues la arquitectura correcta cada vez. En el siguiente [tutorial](preprocessing), aprende a usar tu tokenizador recién cargado, el extractor de características y el procesador para preprocesar un dataset para fine-tuning.
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
36
docs/source/es/bertology.mdx
Normal file
36
docs/source/es/bertology.mdx
Normal file
@ -0,0 +1,36 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# BERTología
|
||||
|
||||
Hay un creciente campo de estudio empeñado en la investigación del funcionamiento interno de los transformers de gran escala como BERT
|
||||
(que algunos llaman "BERTología"). Algunos buenos ejemplos de este campo son:
|
||||
|
||||
|
||||
- BERT Rediscovers the Classical NLP Pipeline por Ian Tenney, Dipanjan Das, Ellie Pavlick:
|
||||
https://arxiv.org/abs/1905.05950
|
||||
- Are Sixteen Heads Really Better than One? por Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
|
||||
- What Does BERT Look At? An Analysis of BERT's Attention por Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
|
||||
Manning: https://arxiv.org/abs/1906.04341
|
||||
|
||||
Para asistir al desarrollo de este nuevo campo, hemos incluido algunas features adicionales en los modelos BERT/GPT/GPT-2 para
|
||||
ayudar a acceder a las representaciones internas, principalmente adaptado de la gran obra de Paul Michel
|
||||
(https://arxiv.org/abs/1905.10650):
|
||||
|
||||
|
||||
- accediendo a todos los hidden-states de BERT/GPT/GPT-2,
|
||||
- accediendo a todos los pesos de atención para cada head de BERT/GPT/GPT-2,
|
||||
- adquiriendo los valores de salida y gradientes de las heads para poder computar la métrica de importancia de las heads y realizar la poda de heads como se explica
|
||||
en https://arxiv.org/abs/1905.10650.
|
||||
|
||||
Para ayudarte a entender y usar estas features, hemos añadido un script específico de ejemplo: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) mientras extraes información y cortas un modelo pre-entrenado en
|
||||
GLUE.
|
||||
367
docs/source/es/create_a_model.mdx
Normal file
367
docs/source/es/create_a_model.mdx
Normal file
@ -0,0 +1,367 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Crea una arquitectura personalizada
|
||||
|
||||
Una [`AutoClass`](model_doc/auto) infiere, automáticamente, la arquitectura del modelo y descarga la configuración y los pesos del modelo preentrenado. Normalmente, recomendamos usar una `AutoClass` para producir un código agnóstico a puntos de guardado o checkpoints. Sin embargo, los usuarios que quieran más control sobre los parámetros específicos de los modelos pueden crear su propio modelo 🤗 Transformers personalizado a partir de varias clases base. Esto puede ser particularmente útil para alguien que esté interesado en estudiar, entrenar o experimentar con modelos 🤗 Transformers. En esta guía vamos a profundizar en la creación de modelos personalizados sin usar `AutoClass`. Aprenderemos a:
|
||||
|
||||
- Cargar y personalizar una configuración para un modelo.
|
||||
- Crear una arquitectura para un modelo.
|
||||
- Crear tokenizadores rápidos y lentos para textos.
|
||||
- Crear un extractor de propiedades para tareas de audio o imágenes.
|
||||
- Crear un procesador para tareas multimodales.
|
||||
|
||||
## Configuración
|
||||
|
||||
Una [configuración](main_classes/configuration) es un conjunto de atributos específicos de un modelo. Cada configuración de modelo tiene atributos diferentes. Por ejemplo, todos los modelos de PLN tienen los atributos `hidden_size`, `num_attention_heads`, `num_hidden_layers` y `vocab_size` en común. Estos atributos especifican el número de cabezas de atención o de capas ocultas con las que se construyen los modelos.
|
||||
|
||||
Puedes echarle un vistazo a [DistilBERT](model_doc/distilbert) y sus atributos accediendo a [`DistilBertConfig`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import DistilBertConfig
|
||||
|
||||
>>> config = DistilBertConfig()
|
||||
>>> print(config)
|
||||
DistilBertConfig {
|
||||
"activation": "gelu",
|
||||
"attention_dropout": 0.1,
|
||||
"dim": 768,
|
||||
"dropout": 0.1,
|
||||
"hidden_dim": 3072,
|
||||
"initializer_range": 0.02,
|
||||
"max_position_embeddings": 512,
|
||||
"model_type": "distilbert",
|
||||
"n_heads": 12,
|
||||
"n_layers": 6,
|
||||
"pad_token_id": 0,
|
||||
"qa_dropout": 0.1,
|
||||
"seq_classif_dropout": 0.2,
|
||||
"sinusoidal_pos_embds": false,
|
||||
"transformers_version": "4.16.2",
|
||||
"vocab_size": 30522
|
||||
}
|
||||
```
|
||||
|
||||
[`DistilBertConfig`] muestra todos los atributos por defecto que se han usado para construir un modelo [`DistilBertModel`] base. Todos ellos son personalizables, lo que deja espacio para poder experimentar. Por ejemplo, puedes personalizar un modelo predeterminado para:
|
||||
|
||||
- Probar una función de activación diferente, usando el parámetro `activation`.
|
||||
- Usar un valor de abandono (también conocido como _dropout_) más alto para las probabilidades de las capas de atención, usando el parámetro `attention_dropout`.
|
||||
|
||||
```py
|
||||
>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4)
|
||||
>>> print(my_config)
|
||||
DistilBertConfig {
|
||||
"activation": "relu",
|
||||
"attention_dropout": 0.4,
|
||||
"dim": 768,
|
||||
"dropout": 0.1,
|
||||
"hidden_dim": 3072,
|
||||
"initializer_range": 0.02,
|
||||
"max_position_embeddings": 512,
|
||||
"model_type": "distilbert",
|
||||
"n_heads": 12,
|
||||
"n_layers": 6,
|
||||
"pad_token_id": 0,
|
||||
"qa_dropout": 0.1,
|
||||
"seq_classif_dropout": 0.2,
|
||||
"sinusoidal_pos_embds": false,
|
||||
"transformers_version": "4.16.2",
|
||||
"vocab_size": 30522
|
||||
}
|
||||
```
|
||||
|
||||
Los atributos de los modelos preentrenados pueden ser modificados con la función [`~PretrainedConfig.from_pretrained`]:
|
||||
|
||||
```py
|
||||
>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4)
|
||||
```
|
||||
|
||||
Cuando estés satisfecho con la configuración de tu modelo, puedes guardarlo con la función [`~PretrainedConfig.save_pretrained`]. Tu configuración se guardará en un archivo JSON dentro del directorio que le especifiques como parámetro.
|
||||
|
||||
```py
|
||||
>>> my_config.save_pretrained(save_directory="./your_model_save_path")
|
||||
```
|
||||
|
||||
Para volver a usar el archivo de configuración, puedes cargarlo usando [`~PretrainedConfig.from_pretrained`]:
|
||||
|
||||
```py
|
||||
>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json")
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
También puedes guardar los archivos de configuración como un diccionario; o incluso guardar solo la diferencia entre tu archivo personalizado y la configuración por defecto. Consulta la [documentación sobre configuración](main_classes/configuration) para ver más detalles.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Modelo
|
||||
|
||||
El siguiente paso será crear un [modelo](main_classes/models). El modelo, al que a veces también nos referimos como arquitectura, es el encargado de definir cada capa y qué operaciones se realizan. Los atributos como `num_hidden_layers` de la configuración se usan para definir la arquitectura. Todos los modelos comparten una clase base, [`PreTrainedModel`], y algunos métodos comunes que se pueden usar para redimensionar los _embeddings_ o para recortar cabezas de auto-atención (también llamadas _self-attention heads_). Además, todos los modelos son subclases de [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) o [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module), lo que significa que son compatibles con su respectivo framework.
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
Carga los atributos de tu configuración personalizada en el modelo de la siguiente forma:
|
||||
|
||||
```py
|
||||
>>> from transformers import DistilBertModel
|
||||
|
||||
>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json")
|
||||
>>> model = DistilBertModel(my_config)
|
||||
```
|
||||
|
||||
Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos del preentramiento, por lo que no serás capaz de usar este modelo para nada útil hasta que no lo entrenes. El entrenamiento es un proceso costoso, tanto en cuestión de recursos como de tiempo, por lo que generalmente es mejor usar un modelo preentrenado para obtener mejores resultados más rápido, consumiendo una fracción de los recursos que un entrenamiento completo hubiera requerido.
|
||||
|
||||
Puedes crear un modelo preentrenado con [`~PreTrainedModel.from_pretrained`]:
|
||||
|
||||
```py
|
||||
>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
Cuando cargues tus pesos del preentramiento, el modelo por defecto se carga automáticamente si nos lo proporciona 🤗 Transformers. Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos:
|
||||
|
||||
```py
|
||||
>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
Carga los atributos de tu configuración personalizada en el modelo de la siguiente forma:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFDistilBertModel
|
||||
|
||||
>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json")
|
||||
>>> tf_model = TFDistilBertModel(my_config)
|
||||
```
|
||||
|
||||
Esto crea un modelo con valores aleatorios, en lugar de crearlo con los pesos del preentramiento, por lo que no serás capaz de usar este modelo para nada útil hasta que no lo entrenes. El entrenamiento es un proceso costoso, tanto en cuestión de recursos como de tiempo, por lo que generalmente es mejor usar un modelo preentrenado para obtener mejores resultados más rápido, consumiendo solo una fracción de los recursos que un entrenamiento completo hubiera requerido.
|
||||
|
||||
Puedes crear un modelo preentrenado con [`~TFPreTrainedModel.from_pretrained`]:
|
||||
|
||||
```py
|
||||
>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
Cuando cargues tus pesos del preentramiento, el modelo por defecto se carga automáticamente si este nos lo proporciona 🤗 Transformers. Sin embargo, siempre puedes reemplazar (todos o algunos de) los atributos del modelo por defecto por los tuyos:
|
||||
|
||||
```py
|
||||
>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
### Cabezas de modelo
|
||||
|
||||
En este punto del tutorial, tenemos un modelo DistilBERT base que devuelve los *hidden states* o estados ocultos. Los *hidden states* se pasan como parámetros de entrada a la cabeza del modelo para producir la salida. 🤗 Transformers ofrece una cabeza de modelo diferente para cada tarea, siempre y cuando el modelo sea compatible para la tarea (por ejemplo, no puedes usar DistilBERT para una tarea secuencia a secuencia como la traducción).
|
||||
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
Por ejemplo, [`DistilBertForSequenceClassification`] es un modelo DistilBERT base con una cabeza de clasificación de secuencias. La cabeza de clasificación de secuencias es una capa superior que precede a la recolección de las salidas.
|
||||
|
||||
```py
|
||||
>>> from transformers import DistilBertForSequenceClassification
|
||||
|
||||
>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`DistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*.
|
||||
|
||||
|
||||
```py
|
||||
>>> from transformers import DistilBertForQuestionAnswering
|
||||
|
||||
>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
Por ejemplo, [`TFDistilBertForSequenceClassification`] es un modelo DistilBERT base con una cabeza de clasificación de secuencias. La cabeza de clasificación de secuencias es una capa superior que precede a la recolección de las salidas.
|
||||
|
||||
```py
|
||||
>>> from transformers import TFDistilBertForSequenceClassification
|
||||
|
||||
>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
Puedes reutilizar este punto de guardado o *checkpoint* para otra tarea fácilmente cambiando a una cabeza de un modelo diferente. Para una tarea de respuesta a preguntas, puedes usar la cabeza del modelo [`TFDistilBertForQuestionAnswering`]. La cabeza de respuesta a preguntas es similar a la de clasificación de secuencias, excepto porque consta de una capa lineal delante de la salida de los *hidden states*.
|
||||
|
||||
|
||||
```py
|
||||
>>> from transformers import TFDistilBertForQuestionAnswering
|
||||
|
||||
>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## Tokenizer
|
||||
|
||||
La ultima clase base que debes conocer antes de usar un modelo con datos textuales es la clase [tokenizer](main_classes/tokenizer), que convierte el texto bruto en tensores. Hay dos tipos de *tokenizers* que puedes usar con 🤗 Transformers:
|
||||
|
||||
- [`PreTrainedTokenizer`]: una implementación de un *tokenizer* hecha en Python.
|
||||
- [`PreTrainedTokenizerFast`]: un *tokenizer* de nuestra librería [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/), basada en Rust. Este tipo de *tokenizer* es bastante más rápido, especialmente durante la tokenización por lotes, gracias a estar implementado en Rust. Esta rápida tokenización también ofrece métodos adicionales como el *offset mapping*, que relaciona los tokens con sus palabras o caracteres originales.
|
||||
|
||||
Ambos *tokenizers* son compatibles con los métodos comunes, como los de encodificación y decodificación, los métodos para añadir tokens y aquellos que manejan tokens especiales.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
No todos los modelos son compatibles con un *tokenizer* rápido. Échale un vistazo a esta [tabla](index#supported-frameworks) para comprobar si un modelo en específico es compatible con un *tokenizer* rápido.
|
||||
|
||||
</Tip>
|
||||
|
||||
Si has entrenado tu propio *tokenizer*, puedes crear uno desde tu archivo de “vocabulario”:
|
||||
|
||||
```py
|
||||
>>> from transformers import DistilBertTokenizer
|
||||
|
||||
>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left")
|
||||
```
|
||||
|
||||
Es importante recordar que los vocabularios que provienen de un *tokenizer* personalizado serán diferentes a los vocabularios generados por el *tokenizer* de un modelo preentrenado. Debes usar el vocabulario de un *tokenizer* preentrenado si vas a usar un modelo preentrenado, de lo contrario las entradas no tendrán sentido. Crea un *tokenizer* con el vocabulario de un modelo preentrenado usado la clase [`DistilBertTokenizer`]:
|
||||
|
||||
|
||||
```py
|
||||
>>> from transformers import DistilBertTokenizer
|
||||
|
||||
>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
Crea un *tokenizer* rápido con la clase [`DistilBertTokenizerFast`]:
|
||||
|
||||
|
||||
```py
|
||||
>>> from transformers import DistilBertTokenizerFast
|
||||
|
||||
>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Por defecto, el [`AutoTokenizer`] intentará cargar un *tokenizer* rápido. Puedes desactivar este compartimiento cambiando el parámetro `use_fast=False` de `from_pretrained`.
|
||||
|
||||
|
||||
</Tip>
|
||||
|
||||
## Extractor de Características
|
||||
|
||||
Un extractor de características procesa entradas de audio e imagen. Hereda de la clase base [`~feature_extraction_utils.FeatureExtractionMixin`] y también puede heredar de la clase [`ImageFeatureExtractionMixin`] para el procesamiento de características de las imágenes o de la clase [`SequenceFeatureExtractor`] para el procesamiento de entradas de audio.
|
||||
|
||||
Dependiendo de si trabajas en una tarea de audio o de video, puedes crear un extractor de características asociado al modelo que estes usando. Por ejemplo, podrías crear un [`ViTFeatureExtractor`] por defecto si estas usando [ViT](model_doc/vit) para clasificación de imágenes:
|
||||
|
||||
```py
|
||||
>>> from transformers import ViTFeatureExtractor
|
||||
|
||||
>>> vit_extractor = ViTFeatureExtractor()
|
||||
>>> print(vit_extractor)
|
||||
ViTFeatureExtractor {
|
||||
"do_normalize": true,
|
||||
"do_resize": true,
|
||||
"feature_extractor_type": "ViTFeatureExtractor",
|
||||
"image_mean": [
|
||||
0.5,
|
||||
0.5,
|
||||
0.5
|
||||
],
|
||||
"image_std": [
|
||||
0.5,
|
||||
0.5,
|
||||
0.5
|
||||
],
|
||||
"resample": 2,
|
||||
"size": 224
|
||||
}
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Si no estás buscando ninguna personalización en específico, usa el método `from_pretrained` para cargar los parámetros del extractor de características por defecto del modelo.
|
||||
|
||||
</Tip>
|
||||
|
||||
Puedes modificar cualquier parámetro de [`ViTFeatureExtractor`] para crear tu extractor de características personalizado:
|
||||
|
||||
```py
|
||||
>>> from transformers import ViTFeatureExtractor
|
||||
|
||||
>>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3])
|
||||
>>> print(my_vit_extractor)
|
||||
ViTFeatureExtractor {
|
||||
"do_normalize": false,
|
||||
"do_resize": true,
|
||||
"feature_extractor_type": "ViTFeatureExtractor",
|
||||
"image_mean": [
|
||||
0.3,
|
||||
0.3,
|
||||
0.3
|
||||
],
|
||||
"image_std": [
|
||||
0.5,
|
||||
0.5,
|
||||
0.5
|
||||
],
|
||||
"resample": "PIL.Image.BOX",
|
||||
"size": 224
|
||||
}
|
||||
```
|
||||
|
||||
Para las entradas de audio, puedes crear un [`Wav2Vec2FeatureExtractor`] y personalizar los parámetros de una forma similar:
|
||||
|
||||
|
||||
```py
|
||||
>>> from transformers import Wav2Vec2FeatureExtractor
|
||||
|
||||
>>> w2v2_extractor = Wav2Vec2FeatureExtractor()
|
||||
>>> print(w2v2_extractor)
|
||||
Wav2Vec2FeatureExtractor {
|
||||
"do_normalize": true,
|
||||
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
||||
"feature_size": 1,
|
||||
"padding_side": "right",
|
||||
"padding_value": 0.0,
|
||||
"return_attention_mask": false,
|
||||
"sampling_rate": 16000
|
||||
}
|
||||
```
|
||||
|
||||
## Procesador
|
||||
|
||||
Para modelos que son compatibles con tareas multimodales, 🤗 Transformers ofrece una clase *procesador* que agrupa un extractor de características y un *tokenizer* en el mismo objeto. Por ejemplo, probemos a usar el procesador [`Wav2Vec2Processor`] para una tarea de reconocimiento de voz (ASR). Un ASR transcribe el audio a texto, por lo que necesitaremos un extractor de características y un *tokenizer*.
|
||||
|
||||
Crea un extractor de características para manejar la entrada de audio:
|
||||
|
||||
|
||||
```py
|
||||
>>> from transformers import Wav2Vec2FeatureExtractor
|
||||
|
||||
>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True)
|
||||
```
|
||||
|
||||
Crea un *tokenizer* para manejar la entrada de texto:
|
||||
|
||||
```py
|
||||
>>> from transformers import Wav2Vec2CTCTokenizer
|
||||
|
||||
>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt")
|
||||
```
|
||||
|
||||
Puedes combinar el extractor de características y el *tokenizer* en el [`Wav2Vec2Processor`]:
|
||||
|
||||
|
||||
```py
|
||||
>>> from transformers import Wav2Vec2Processor
|
||||
|
||||
>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
|
||||
```
|
||||
Con dos clases base (la configuración y el modelo) y una clase de preprocesamiento adicional (*tokenizer*, extractor de características o procesador), puedes crear cualquiera de los modelos compatibles con 🤗 Transformers. Cada una de estas clases son configurables, permitiéndote usar sus atributos específicos. Puedes crear un modelo para entrenarlo de una forma fácil, o modificar un modelo preentrenado disponible para especializarlo.
|
||||
70
docs/source/es/fast_tokenizers.mdx
Normal file
70
docs/source/es/fast_tokenizers.mdx
Normal file
@ -0,0 +1,70 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Usa los tokenizadores de 🤗 Tokenizers
|
||||
|
||||
[`PreTrainedTokenizerFast`] depende de la biblioteca [🤗 Tokenizers](https://huggingface.co/docs/tokenizers). Los tokenizadores obtenidos desde la biblioteca 🤗 Tokenizers pueden ser
|
||||
cargados de forma muy sencilla en los 🤗 Transformers.
|
||||
|
||||
Antes de entrar en detalles, comencemos creando un tokenizador dummy en unas cuantas líneas:
|
||||
|
||||
```python
|
||||
>>> from tokenizers import Tokenizer
|
||||
>>> from tokenizers.models import BPE
|
||||
>>> from tokenizers.trainers import BpeTrainer
|
||||
>>> from tokenizers.pre_tokenizers import Whitespace
|
||||
|
||||
>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
|
||||
>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
||||
|
||||
>>> tokenizer.pre_tokenizer = Whitespace()
|
||||
>>> files = [...]
|
||||
>>> tokenizer.train(files, trainer)
|
||||
```
|
||||
|
||||
Ahora tenemos un tokenizador entrenado en los archivos que definimos. Lo podemos seguir utilizando en ese entorno de ejecución (runtime en inglés), o puedes guardarlo
|
||||
en un archivo JSON para reutilizarlo en un futuro.
|
||||
|
||||
## Cargando directamente desde el objeto tokenizador
|
||||
|
||||
Veamos cómo utilizar este objeto tokenizador en la biblioteca 🤗 Transformers. La clase
|
||||
[`PreTrainedTokenizerFast`] permite una instanciación fácil, al aceptar el objeto
|
||||
*tokenizer* instanciado como argumento:
|
||||
|
||||
```python
|
||||
>>> from transformers import PreTrainedTokenizerFast
|
||||
|
||||
>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
|
||||
```
|
||||
|
||||
Este objeto ya puede ser utilizado con todos los métodos compartidos por los tokenizadores de 🤗 Transformers! Visita la [página sobre tokenizadores
|
||||
](main_classes/tokenizer) para más información.
|
||||
|
||||
## Cargando desde un archivo JSON
|
||||
|
||||
Para cargar un tokenizador desde un archivo JSON, comencemos por guardar nuestro tokenizador:
|
||||
|
||||
```python
|
||||
>>> tokenizer.save("tokenizer.json")
|
||||
```
|
||||
|
||||
La localización (path en inglés) donde este archivo es guardado puede ser incluida en el método de inicialización de [`PreTrainedTokenizerFast`]
|
||||
utilizando el parámetro `tokenizer_file`:
|
||||
|
||||
```python
|
||||
>>> from transformers import PreTrainedTokenizerFast
|
||||
|
||||
>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
|
||||
```
|
||||
|
||||
Este objeto ya puede ser utilizado con todos los métodos compartidos por los tokenizadores de 🤗 Transformers! Visita la [página sobre tokenizadores
|
||||
](main_classes/tokenizer) para más información.
|
||||
271
docs/source/es/index.mdx
Normal file
271
docs/source/es/index.mdx
Normal file
@ -0,0 +1,271 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# 🤗 Transformers
|
||||
|
||||
Machine Learning de última generación para PyTorch, TensorFlow y JAX.
|
||||
|
||||
🤗 Transformers proporciona APIs para descargar y entrenar fácilmente modelos preentrenados de última generación. El uso de modelos preentrenados puede reducir tus costos de cómputo, tu huella de carbono y ahorrarte tiempo al entrenar un modelo desde cero. Los modelos se pueden utilizar en diferentes modalidades, tales como:
|
||||
|
||||
* 📝 Texto: clasificación de texto, extracción de información, respuesta a preguntas, resumir, traducción y generación de texto en más de 100 idiomas.
|
||||
* 🖼️ Imágenes: clasificación de imágenes, detección de objetos y segmentación.
|
||||
* 🗣️ Audio: reconocimiento de voz y clasificación de audio.
|
||||
* 🐙 Multimodal: respuesta a preguntas en tablas, reconocimiento óptico de caracteres, extracción de información de documentos escaneados, clasificación de videos y respuesta visual a preguntas.
|
||||
|
||||
Nuestra biblioteca admite una integración perfecta entre tres de las bibliotecas de deep learning más populares: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) y [JAX](https://jax.readthedocs.io/en/latest/). Entrena tu modelo con tres líneas de código en un framework y cárgalo para inferencia con otro.
|
||||
Cada arquitectura de 🤗 Transformers se define en un módulo de Python independiente para que se puedan personalizar fácilmente para investigación y experimentos.
|
||||
|
||||
## Si estás buscando soporte personalizado del equipo de Hugging Face
|
||||
|
||||
<a target="_blank" href="https://huggingface.co/support">
|
||||
<img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
|
||||
</a><br>
|
||||
|
||||
## Contenidos
|
||||
|
||||
La documentación está organizada en cuatro partes:
|
||||
|
||||
- **EMPEZAR** contiene un recorrido rápido e instrucciones de instalación para comenzar a usar 🤗 Transformers.
|
||||
- **TUTORIALES** es un excelente lugar para comenzar. Esta sección te ayudará a obtener las habilidades básicas que necesitas para comenzar a usar 🤗 Transformers.
|
||||
- **GUÍAS PRÁCTICAS** te mostrará cómo lograr un objetivo específico, cómo hacer fine-tuning a un modelo preentrenado para el modelado de lenguaje o cómo crear un cabezal para un modelo personalizado.
|
||||
- **GUÍAS CONCEPTUALES** proporciona más discusión y explicación de los conceptos e ideas subyacentes detrás de los modelos, las tareas y la filosofía de diseño de 🤗 Transformers.
|
||||
|
||||
La biblioteca actualmente contiene implementaciones de JAX, PyTorch y TensorFlow, pesos de modelos preentrenados, scripts de uso y utilidades de conversión para los siguientes modelos.
|
||||
|
||||
### Modelos compatibles
|
||||
|
||||
<!--This list is updated automatically from the README with _make fix-copies_. Do not update manually! -->
|
||||
|
||||
1. **[ALBERT](model_doc/albert)** (de Google Research y el Instituto Tecnológico de Toyota en Chicago) publicado con el paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), por Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||
1. **[BART](model_doc/bart)** (de Facebook) publicado con el paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) por Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov y Luke Zettlemoyer.
|
||||
1. **[BARThez](model_doc/barthez)** (de École polytechnique) publicado con el paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) por Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
|
||||
1. **[BARTpho](model_doc/bartpho)** (de VinAI Research) publicado con el paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) por Nguyen Luong Tran, Duong Minh Le y Dat Quoc Nguyen.
|
||||
1. **[BEiT](model_doc/beit)** (de Microsoft) publicado con el paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) por Hangbo Bao, Li Dong, Furu Wei.
|
||||
1. **[BERT](model_doc/bert)** (de Google) publicado con el paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) por Jacob Devlin, Ming-Wei Chang, Kenton Lee y Kristina Toutanova.
|
||||
1. **[BERTweet](model_doc/bertweet)** (de VinAI Research) publicado con el paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) por Dat Quoc Nguyen, Thanh Vu y Anh Tuan Nguyen.
|
||||
1. **[BERT For Sequence Generation](model_doc/bert-generation)** (de Google) publicado con el paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) por Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[BigBird-RoBERTa](model_doc/big_bird)** (de Google Research) publicado con el paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) por Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (de Google Research) publicado con el paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) por Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||
1. **[Blenderbot](model_doc/blenderbot)** (de Facebook) publicado con el paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) por Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BlenderbotSmall](model_doc/blenderbot-small)** (de Facebook) publicado con el paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) por Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||
1. **[BORT](model_doc/bort)** (de Alexa) publicado con el paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) por Adrian de Wynter y Daniel J. Perry.
|
||||
1. **[ByT5](model_doc/byt5)** (de Google Research) publicado con el paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) por Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
|
||||
1. **[CamemBERT](model_doc/camembert)** (de Inria/Facebook/Sorbonne) publicado con el paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) por Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah y Benoît Sagot.
|
||||
1. **[CANINE](model_doc/canine)** (de Google Research) publicado con el paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) por Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
||||
1. **[ConvNeXT](model_doc/convnext)** (de Facebook AI) publicado con el paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) por Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
|
||||
1. **[CLIP](model_doc/clip)** (de OpenAI) publicado con el paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) por Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
||||
1. **[ConvBERT](model_doc/convbert)** (de YituTech) publicado con el paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) por Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
|
||||
1. **[CPM](model_doc/cpm)** (de Universidad de Tsinghua) publicado con el paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) por Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
|
||||
1. **[CTRL](model_doc/ctrl)** (de Salesforce) publicado con el paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) por Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong y Richard Socher.
|
||||
1. **[Data2Vec](model_doc/data2vec)** (de Facebook) publicado con el paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) por Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
|
||||
1. **[DeBERTa](model_doc/deberta)** (de Microsoft) publicado con el paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) por Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[DeBERTa-v2](model_doc/deberta-v2)** (de Microsoft) publicado con el paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) por Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
|
||||
1. **[Decision Transformer](model_doc/decision_transformer)** (de Berkeley/Facebook/Google) publicado con el paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) por Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
|
||||
1. **[DiT](model_doc/dit)** (de Microsoft Research) publicado con el paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) por Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
|
||||
1. **[DeiT](model_doc/deit)** (de Facebook) publicado con el paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) por Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
|
||||
1. **[DETR](model_doc/detr)** (de Facebook) publicado con el paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) por Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
|
||||
1. **[DialoGPT](model_doc/dialogpt)** (de Microsoft Research) publicado con el paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) por Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
|
||||
1. **[DistilBERT](model_doc/distilbert)** (de HuggingFace), publicado junto con el paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) por Victor Sanh, Lysandre Debut y Thomas Wolf. Se ha aplicado el mismo método para comprimir GPT2 en [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa en [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), BERT multilingüe en [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) y una versión alemana de DistilBERT.
|
||||
1. **[DPR](model_doc/dpr)** (de Facebook) publicado con el paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) por Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, y Wen-tau Yih.
|
||||
1. **[DPT](master/model_doc/dpt)** (de Intel Labs) publicado con el paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) por René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
|
||||
1. **[EncoderDecoder](model_doc/encoder-decoder)** (de Google Research) publicado con el paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) por Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||
1. **[ELECTRA](model_doc/electra)** (de Google Research/Universidad de Stanford) publicado con el paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) por Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
|
||||
1. **[FlauBERT](model_doc/flaubert)** (de CNRS) publicado con el paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) por Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
1. **[FNet](model_doc/fnet)** (de Google Research) publicado con el paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) por James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
|
||||
1. **[Funnel Transformer](model_doc/funnel)** (de CMU/Google Brain) publicado con el paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) por Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
|
||||
1. **[GLPN](model_doc/glpn)** (de KAIST) publicado con el paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) por Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
|
||||
1. **[GPT](model_doc/openai-gpt)** (de OpenAI) publicado con el paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) por Alec Radford, Karthik Narasimhan, Tim Salimans y Ilya Sutskever.
|
||||
1. **[GPT-2](model_doc/gpt2)** (de OpenAI) publicado con el paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) por Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** y Ilya Sutskever**.
|
||||
1. **[GPT-J](model_doc/gptj)** (de EleutherAI) publicado con el repositorio [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) por Ben Wang y Aran Komatsuzaki.
|
||||
1. **[GPT Neo](model_doc/gpt_neo)** (de EleutherAI) publicado en el paper [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) por Sid Black, Stella Biderman, Leo Gao, Phil Wang y Connor Leahy.
|
||||
1. **[Hubert](model_doc/hubert)** (de Facebook) publicado con el paper [HuBERT: Self-Supervised Speech Representation Learning por Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) por Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
|
||||
1. **[I-BERT](model_doc/ibert)** (de Berkeley) publicado con el paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) por Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
|
||||
1. **[ImageGPT](model_doc/imagegpt)** (de OpenAI) publicado con el paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) por Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
|
||||
1. **[LayoutLM](model_doc/layoutlm)** (de Microsoft Research Asia) publicado con el paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) por Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
|
||||
1. **[LayoutLMv2](model_doc/layoutlmv2)** (de Microsoft Research Asia) publicado con el paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) por Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
|
||||
1. **[LayoutXLM](model_doc/layoutlmv2)** (de Microsoft Research Asia) publicado con el paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) por Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
|
||||
1. **[LED](model_doc/led)** (de AllenAI) publicado con el paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) por Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[Longformer](model_doc/longformer)** (de AllenAI) publicado con el paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) por Iz Beltagy, Matthew E. Peters, Arman Cohan.
|
||||
1. **[LUKE](model_doc/luke)** (de Studio Ousia) publicado con el paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) por Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
|
||||
1. **[mLUKE](model_doc/mluke)** (de Studio Ousia) publicado con el paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) por Ryokan Ri, Ikuya Yamada, y Yoshimasa Tsuruoka.
|
||||
1. **[LXMERT](model_doc/lxmert)** (de UNC Chapel Hill) publicado con el paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) por Hao Tan y Mohit Bansal.
|
||||
1. **[M2M100](model_doc/m2m_100)** (de Facebook) publicado con el paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) por Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
|
||||
1. **[MarianMT](model_doc/marian)** Modelos de traducción automática entrenados usando [OPUS](http://opus.nlpl.eu/) data por Jörg Tiedemann. El [Marian Framework](https://marian-nmt.github.io/) está siendo desarrollado por el equipo de traductores de Microsoft.
|
||||
1. **[MaskFormer](model_doc/maskformer)** (de Meta y UIUC) publicado con el paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) por Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
|
||||
1. **[MBart](model_doc/mbart)** (de Facebook) publicado con el paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) por Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
1. **[MBart-50](model_doc/mbart)** (de Facebook) publicado con el paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) por Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
|
||||
1. **[Megatron-BERT](model_doc/megatron-bert)** (de NVIDIA) publicado con el paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) por Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper y Bryan Catanzaro.
|
||||
1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (de NVIDIA) publicado con el paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) por Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper y Bryan Catanzaro.
|
||||
1. **[MPNet](model_doc/mpnet)** (de Microsoft Research) publicado con el paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) por Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
|
||||
1. **[MT5](model_doc/mt5)** (de Google AI) publicado con el paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) por Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
|
||||
1. **[Nyströmformer](model_doc/nystromformer)** (de la Universidad de Wisconsin - Madison) publicado con el paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) por Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
|
||||
1. **[Pegasus](model_doc/pegasus)** (de Google) publicado con el paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) por Jingqing Zhang, Yao Zhao, Mohammad Saleh y Peter J. Liu.
|
||||
1. **[Perceiver IO](model_doc/perceiver)** (de Deepmind) publicado con el paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) por Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
|
||||
1. **[PhoBERT](model_doc/phobert)** (de VinAI Research) publicado con el paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) por Dat Quoc Nguyen y Anh Tuan Nguyen.
|
||||
1. **[PLBart](model_doc/plbart)** (de UCLA NLP) publicado con el paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) por Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
|
||||
1. **[PoolFormer](model_doc/poolformer)** (de Sea AI Labs) publicado con el paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) por Yu, Weihao y Luo, Mi y Zhou, Pan y Si, Chenyang y Zhou, Yichen y Wang, Xinchao y Feng, Jiashi y Yan, Shuicheng.
|
||||
1. **[ProphetNet](model_doc/prophetnet)** (de Microsoft Research) publicado con el paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) por Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang y Ming Zhou.
|
||||
1. **[QDQBert](model_doc/qdqbert)** (de NVIDIA) publicado con el paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) por Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev y Paulius Micikevicius.
|
||||
1. **[REALM](model_doc/realm.html)** (de Google Research) publicado con el paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) por Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat y Ming-Wei Chang.
|
||||
1. **[Reformer](model_doc/reformer)** (de Google Research) publicado con el paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) por Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
|
||||
1. **[RemBERT](model_doc/rembert)** (de Google Research) publicado con el paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) por Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
|
||||
1. **[RegNet](model_doc/regnet)** (de META Platforms) publicado con el paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) por Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
|
||||
1. **[ResNet](model_doc/resnet)** (de Microsoft Research) publicado con el paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) por Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
|
||||
1. **[RoBERTa](model_doc/roberta)** (de Facebook), publicado junto con el paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) por Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
|
||||
1. **[RoFormer](model_doc/roformer)** (de ZhuiyiTechnology), publicado junto con el paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) por Jianlin Su y Yu Lu y Shengfeng Pan y Bo Wen y Yunfeng Liu.
|
||||
1. **[SegFormer](model_doc/segformer)** (de NVIDIA) publicado con el paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) por Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
|
||||
1. **[SEW](model_doc/sew)** (de ASAPP) publicado con el paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) por Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SEW-D](model_doc/sew_d)** (de ASAPP) publicado con el paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) por Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
|
||||
1. **[SpeechToTextTransformer](model_doc/speech_to_text)** (de Facebook), publicado junto con el paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) por Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
|
||||
1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (de Facebook), publicado junto con el paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) por Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
|
||||
1. **[Splinter](model_doc/splinter)** (de Universidad de Tel Aviv), publicado junto con el paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) pory Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
|
||||
1. **[SqueezeBert](model_doc/squeezebert)** (de Berkeley) publicado con el paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) por Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, y Kurt W. Keutzer.
|
||||
1. **[Swin Transformer](model_doc/swin)** (de Microsoft) publicado con el paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) por Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
|
||||
1. **[T5](model_doc/t5)** (de Google AI) publicado con el paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) por Colin Raffel y Noam Shazeer y Adam Roberts y Katherine Lee y Sharan Narang y Michael Matena y Yanqi Zhou y Wei Li y Peter J. Liu.
|
||||
1. **[T5v1.1](model_doc/t5v1.1)** (de Google AI) publicado en el repositorio [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) por Colin Raffel y Noam Shazeer y Adam Roberts y Katherine Lee y Sharan Narang y Michael Matena y Yanqi Zhou y Wei Li y Peter J. Liu.
|
||||
1. **[TAPAS](model_doc/tapas)** (de Google AI) publicado con el paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) por Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno y Julian Martin Eisenschlos.
|
||||
1. **[TAPEX](model_doc/tapex)** (de Microsoft Research) publicado con el paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) por Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
|
||||
1. **[Transformer-XL](model_doc/transfo-xl)** (de Google/CMU) publicado con el paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) por Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
1. **[TrOCR](model_doc/trocr)** (de Microsoft), publicado junto con el paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) por Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
|
||||
1. **[UniSpeech](model_doc/unispeech)** (de Microsoft Research) publicado con el paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) por Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
|
||||
1. **[UniSpeechSat](model_doc/unispeech-sat)** (de Microsoft Research) publicado con el paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) por Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
|
||||
1. **[VAN](model_doc/van)** (de la Universidad de Tsinghua y la Universidad de Nankai) publicado con el paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) por Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
|
||||
1. **[ViLT](model_doc/vilt)** (de NAVER AI Lab/Kakao Enterprise/Kakao Brain) publicado con el paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) por Wonjae Kim, Bokyung Son, Ildoo Kim.
|
||||
1. **[Vision Transformer (ViT)](model_doc/vit)** (de Google AI) publicado con el paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) por Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
|
||||
1. **[ViTMAE](model_doc/vit_mae)** (de Meta AI) publicado con el paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) por Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
|
||||
1. **[VisualBERT](model_doc/visual_bert)** (de UCLA NLP) publicado con el paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) por Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
|
||||
1. **[WavLM](model_doc/wavlm)** (de Microsoft Research) publicado con el paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) por Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
|
||||
1. **[Wav2Vec2](model_doc/wav2vec2)** (de Facebook AI) publicado con el paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) por Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (de Facebook AI) publicado con el paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) por Qiantong Xu, Alexei Baevski, Michael Auli.
|
||||
1. **[XGLM](model_doc/xglm)** (de Facebook AI) publicado con el paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) por Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
|
||||
1. **[XLM](model_doc/xlm)** (de Facebook) publicado junto con el paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) por Guillaume Lample y Alexis Conneau.
|
||||
1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (de Microsoft Research) publicado con el paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) por Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang y Ming Zhou.
|
||||
1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (de Facebook AI), publicado junto con el paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) por Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer y Veselin Stoyanov.
|
||||
1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (de Facebook AI), publicado junto con el paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) por Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
|
||||
1. **[XLNet](model_doc/xlnet)** (de Google/CMU) publicado con el paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) por Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (de Facebook AI) publicado con el paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) por Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
|
||||
1. **[XLS-R](model_doc/xls_r)** (de Facebook AI) publicado con el paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) por Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
|
||||
1. **[YOSO](model_doc/yoso)** (de la Universidad de Wisconsin-Madison) publicado con el paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) por Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
|
||||
|
||||
|
||||
### Frameworks compatibles
|
||||
|
||||
La siguiente tabla representa el soporte actual en la biblioteca para cada uno de esos modelos, ya sea que tengan un tokenizador de Python (llamado "slow"). Un tokenizador "fast" respaldado por la biblioteca 🤗 Tokenizers, ya sea que tengan soporte en Jax (a través de
|
||||
Flax), PyTorch y/o TensorFlow.
|
||||
|
||||
<!--This table is updated automatically from the auto modules with _make fix-copies_. Do not update manually!-->
|
||||
|
||||
| Modelo | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
|
||||
|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:|
|
||||
| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| BART | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| BERT | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| Canine | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| DETR | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| DPR | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| DPT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| FNet | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| GLPN | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| LED | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| LUKE | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Marian | ✅ | ❌ | ✅ | ✅ | ✅ |
|
||||
| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| mBART | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| RAG | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Realm | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| RegNet | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| ResNet | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| Swin | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| TAPEX | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| VAN | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| ViT | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| ViTMAE | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ |
|
||||
| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ |
|
||||
| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
|
||||
<!-- End table-->
|
||||
@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
# Guía de instalación
|
||||
# Instalación
|
||||
|
||||
En esta guía puedes encontrar información para instalar 🤗 Transformers para cualquier biblioteca de Machine Learning con la que estés trabajando. Además, encontrarás información sobre cómo establecer el caché y cómo configurar 🤗 Transformers para correrlo de manera offline (opcional).
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user