Compare commits
v4.14.1...doc_builde (651 commits)
[Commit list omitted: the compare view's Author, SHA1, and Date columns did not survive extraction — only the 651 bare commit SHAs remained, from d7503f5068 through 7c9c41f43c.]
@@ -78,7 +78,7 @@ jobs:
           keys:
             - v0.4-torch_and_tf-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
       - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -99,7 +99,7 @@ jobs:
           path: ~/transformers/tests_output.txt
       - store_artifacts:
           path: ~/transformers/reports

   run_tests_torch_and_tf_all:
     working_directory: ~/transformers
     docker:
@@ -116,7 +116,7 @@ jobs:
           keys:
             - v0.4-torch_and_tf-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
       - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -149,7 +149,7 @@ jobs:
           keys:
             - v0.4-torch_and_flax-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
       - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -169,7 +169,7 @@ jobs:
           path: ~/transformers/tests_output.txt
       - store_artifacts:
           path: ~/transformers/reports

   run_tests_torch_and_flax_all:
     working_directory: ~/transformers
     docker:
@@ -186,7 +186,7 @@ jobs:
           keys:
             - v0.4-torch_and_flax-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
       - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -217,7 +217,7 @@ jobs:
           keys:
             - v0.4-torch-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
       - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -237,7 +237,7 @@ jobs:
           path: ~/transformers/tests_output.txt
       - store_artifacts:
           path: ~/transformers/reports

   run_tests_torch_all:
     working_directory: ~/transformers
     docker:
@@ -253,7 +253,7 @@ jobs:
           keys:
             - v0.4-torch-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
       - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -284,7 +284,7 @@ jobs:
           keys:
             - v0.4-tf-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
       - run: pip install tensorflow_probability
@@ -304,7 +304,7 @@ jobs:
           path: ~/transformers/tests_output.txt
       - store_artifacts:
           path: ~/transformers/reports

   run_tests_tf_all:
     working_directory: ~/transformers
     docker:
@@ -320,7 +320,7 @@ jobs:
           keys:
             - v0.4-tf-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
       - run: pip install tensorflow_probability
@@ -351,7 +351,7 @@ jobs:
           keys:
             - v0.4-flax-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[flax,testing,sentencepiece,flax-speech,vision]
       - run: pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -370,7 +370,7 @@ jobs:
           path: ~/transformers/tests_output.txt
       - store_artifacts:
           path: ~/transformers/reports

   run_tests_flax_all:
     working_directory: ~/transformers
     docker:
@@ -386,7 +386,7 @@ jobs:
           keys:
             - v0.4-flax-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[flax,testing,sentencepiece,vision,flax-speech]
       - run: pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -417,7 +417,7 @@ jobs:
           keys:
             - v0.4-torch-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
       - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -437,7 +437,7 @@ jobs:
           path: ~/transformers/tests_output.txt
       - store_artifacts:
           path: ~/transformers/reports

   run_tests_pipelines_torch_all:
     working_directory: ~/transformers
     docker:
@@ -454,7 +454,7 @@ jobs:
           keys:
             - v0.4-torch-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
       - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -549,7 +549,7 @@ jobs:
             - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
       - run: pip install --upgrade pip
-      - run: pip install .[ja,testing,sentencepiece,jieba]
+      - run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy]
       - run: python -m unidic download
       - save_cache:
           key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
@@ -557,7 +557,11 @@ jobs:
             - '~/.cache/pip'
       - run: |
           if [ -f test_list.txt ]; then
-            python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py | tee tests_output.txt
+            python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt
+          fi
+      - run: |
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 1 tests/test_tokenization_clip.py --dist=loadfile -s --make-reports=tests_tokenization_clip --durations=100 | tee tests_output.txt
           fi
       - store_artifacts:
           path: ~/transformers/tests_output.txt
@@ -579,7 +583,7 @@ jobs:
           keys:
             - v0.4-torch_examples-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
       - run: pip install -r examples/pytorch/_tests_requirements.txt
@@ -614,7 +618,7 @@ jobs:
           keys:
             - v0.4-torch_examples-{{ checksum "setup.py" }}
             - v0.4-{{ checksum "setup.py" }}
-      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+      - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
       - run: pip install --upgrade pip
       - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
       - run: pip install -r examples/pytorch/_tests_requirements.txt
@@ -662,7 +666,7 @@ jobs:
           path: ~/transformers/flax_examples_output.txt
       - store_artifacts:
           path: ~/transformers/reports

   run_examples_flax_all:
     working_directory: ~/transformers
     docker:
@@ -729,7 +733,7 @@ jobs:
           path: ~/transformers/tests_output.txt
       - store_artifacts:
           path: ~/transformers/reports

   run_tests_hub_all:
     working_directory: ~/transformers
     docker:
@@ -795,7 +799,7 @@ jobs:
           path: ~/transformers/tests_output.txt
       - store_artifacts:
           path: ~/transformers/reports

   run_tests_onnxruntime_all:
     working_directory: ~/transformers
     docker:
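The one change repeated across these CI jobs is the addition of espeak-ng to the apt dependencies, presumably for the phoneme-based tokenizer tests, which rely on the phonemizer package's espeak backend. A minimal sketch for reproducing that system setup locally, assuming a Debian/Ubuntu host:

# Mirrors the updated CI install line.
sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng

# Smoke-test: -q suppresses audio output, -x prints a phoneme transcription.
espeak-ng --version
espeak-ng -q -x "hello world"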
.github/ISSUE_TEMPLATE/bug-report.md (2 changed lines)
@@ -49,7 +49,7 @@ Library:
 - Deepspeed: @stas00
 - Ray/raytune: @richardliaw, @amogkam
 - Text generation: @patrickvonplaten @narsil
-- Tokenizers: @LysandreJik
+- Tokenizers: @SaulLu
 - Trainer: @sgugger
 - Pipelines: @Narsil
 - Speech: @patrickvonplaten, @anton-l
.github/workflows/add-model-like.yml (new file, 61 lines)
@@ -0,0 +1,61 @@
+name: Add model like runner
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+    types: [opened, synchronize, reopened]
+
+jobs:
+  run_tests_templates_like:
+    name: "Add new model like template tests"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: ~/.cache/pip
+          key: v1-tests_model_like
+          restore-keys: |
+            v1-tests_model_like-${{ hashFiles('setup.py') }}
+            v1-tests_model_like
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip!=21.3
+          pip install -U click  # Click 7 is installed in the environment by default, but we need at least version 8 for Black
+          sudo apt -y update && sudo apt install -y libsndfile1-dev
+          pip install .[dev]
+
+      - name: Create model files
+        run: |
+          transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
+          make style
+          make fix-copies
+
+      - name: Run all PyTorch modeling test
+        run: |
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_new_models tests/bert_new/test_modeling_bert_new.py
+
+      - name: Run style changes
+        run: |
+          make style && make quality && make repo-consistency
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_new_models/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_new_models_test_reports
+          path: reports/tests_new_models
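The same flow can be run locally before relying on CI — a sketch using only commands taken from the workflow above (the fixture path and test location are copied verbatim, so a transformers repo root is assumed as the working directory):

# From a transformers checkout with the dev extras installed:
pip install -e ".[dev]"

# Generate a DistilBERT-like model skeleton from the test fixture config.
transformers-cli add-new-model-like \
  --config_file tests/fixtures/add_distilbert_like_config.json \
  --path_to_repo .

# Normalize formatting and auto-generated copies, then run the new model's tests.
make style && make fix-copies
python -m pytest -n 2 --dist=loadfile -s \
  --make-reports=tests_new_models tests/bert_new/test_modeling_bert_new.py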
.github/workflows/build-docker-images.yml (new file, 145 lines)
@@ -0,0 +1,145 @@
+name: Build docker images (scheduled)
+
+on:
+  push:
+    branches:
+      - docker-image*
+  repository_dispatch:
+  schedule:
+    - cron: "0 1 * * *"
+
+concurrency:
+  group: docker-images-builds
+  cancel-in-progress: false
+
+jobs:
+  latest-docker:
+    name: "Latest PyTorch + TensorFlow [dev]"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      - name: Check out code
+        uses: actions/checkout@v2
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      - name: Build and push
+        uses: docker/build-push-action@v2
+        with:
+          context: ./docker/transformers-all-latest-gpu
+          build-args: |
+            REF=master
+          push: true
+          tags: huggingface/transformers-all-latest-gpu
+
+  latest-torch-deepspeed-docker:
+    name: "Latest PyTorch + DeepSpeed"
+    needs: latest-docker
+    runs-on: ubuntu-latest
+    steps:
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      - name: Check out code
+        uses: actions/checkout@v2
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      - name: Build and push
+        uses: docker/build-push-action@v2
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+          build-args: |
+            REF=master
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu
+
+  doc-builder:
+    name: "Doc builder"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      - name: Check out code
+        uses: actions/checkout@v2
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      - name: Build and push
+        uses: docker/build-push-action@v2
+        with:
+          context: ./docker/transformers-doc-builder
+          push: true
+          tags: huggingface/transformers-doc-builder
+
+  latest-pytorch:
+    name: "Latest PyTorch [dev]"
+    runs-on: ubuntu-latest
+    needs: latest-torch-deepspeed-docker
+    steps:
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      - name: Check out code
+        uses: actions/checkout@v2
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      - name: Build and push
+        uses: docker/build-push-action@v2
+        with:
+          context: ./docker/transformers-pytorch-gpu
+          build-args: |
+            REF=master
+          push: true
+          tags: huggingface/transformers-pytorch-gpu
+
+  latest-tensorflow:
+    needs: latest-pytorch
+    name: "Latest TensorFlow [dev]"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      - name: Check out code
+        uses: actions/checkout@v2
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      - name: Build and push
+        uses: docker/build-push-action@v2
+        with:
+          context: ./docker/transformers-tensorflow-gpu
+          build-args: |
+            REF=master
+          push: true
+          tags: huggingface/transformers-tensorflow-gpu
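Each job above is a thin wrapper around one buildx invocation. An equivalent local command for the first image — a sketch assuming Docker with Buildx is installed and you are already logged in to Docker Hub; the context path, build arg, and tag are copied from the latest-docker job:

# Build and push the "all latest" GPU image the same way the scheduled job does.
docker buildx build \
  --build-arg REF=master \
  --tag huggingface/transformers-all-latest-gpu \
  --push \
  ./docker/transformers-all-latest-gpu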
.github/workflows/build_dev_documentation.yml (new file, 117 lines)
@@ -0,0 +1,117 @@
+name: Build dev documentation
+
+on:
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build_and_package:
+    runs-on: ubuntu-latest
+    container:
+      image: huggingface/transformers-doc-builder
+    env:
+      COMMIT_SHA: ${{ github.event.pull_request.head.sha }}
+      PR_NUMBER: ${{ github.event.number }}
+      EVENT_CONTEXT: ${{ toJSON(github.event) }}
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          repository: 'huggingface/hf-doc-utils'
+          path: hf-doc-utils
+
+      - uses: actions/checkout@v2
+        with:
+          repository: 'huggingface/transformers'
+          path: transformers
+
+      - uses: actions/checkout@v2
+        with:
+          repository: 'huggingface/notebooks'
+          path: notebooks
+
+      - uses: actions/setup-node@v2
+        with:
+          node-version: '16'
+
+      - name: Set env
+        run: |
+          echo "WRITE=$(echo 'ghp_'$(wget -qO- lysand.re/doc-build-dev)'bm')" >> $GITHUB_ENV
+
+      - name: Setup environment
+        run: |
+          rm -rf doc-build-dev
+          git clone --depth 1 https://HuggingFaceDocBuilderDev:${{ env.WRITE }}@github.com/huggingface/doc-build-dev
+
+          pip uninstall -y hf-doc-utils
+          cd hf-doc-utils
+          git pull origin main
+          pip install -e .
+          cd ..
+
+          cd transformers
+          pip install .[dev]
+          cd ..
+
+          cd notebooks
+          git pull origin master
+          cd ..
+
+      - name: Setup git
+        run: |
+          git config --global user.name "Hugging Face Doc Builder"
+          git config --global user.email docs@huggingface.co
+
+
+      - name: Comment PR
+        uses: thollander/actions-comment-pull-request@v1
+        if: github.event.action == 'opened'
+
+        with:
+          message: 'The docs for this PR live [here](https://moon-ci-docs.huggingface.co/docs/transformers/pr_${{ env.PR_NUMBER }}). All of your documentation changes will be reflected on that endpoint.'
+          GITHUB_TOKEN: ${{ env.WRITE }}
+
+      # - name: Find Comment
+      #   if: github.event.action == 'reopened'
+      #   uses: peter-evans/find-comment@v1
+      #   id: fc
+      #   with:
+      #     issue-number: ${{ env.PR_NUMBER }}
+      #     comment-author: HuggingFaceDocBuilder
+
+      # - name: Update comment
+      #   if: github.event.action == 'reopened'
+      #   uses: peter-evans/create-or-update-comment@v1
+      #   with:
+      #     comment-id: ${{ steps.fc.outputs.comment-id }}
+      #     token: ${{ env.WRITE }}
+      #     edit-mode: replace
+      #     body: |
+      #       The docs for this PR live [here](https://moon-ci-docs.huggingface.co/docs/transformers/pr_${{ env.PR_NUMBER }}). All of your documentation changes will be reflected on that endpoint.
+
+      - name: Make documentation
+        env:
+          NODE_OPTIONS: --max-old-space-size=6656
+        run: |
+          cd doc-build-dev && git pull
+          cd ../hf-doc-utils
+          hf-doc-utils build transformers ../transformers/docs/source --build_dir ../doc-build-dev --notebook_dir ../notebooks/transformers_doc --clean --version pr_$PR_NUMBER --html
+
+      - name: Push to repositories
+        run: |
+          cd doc-build-dev
+          ls
+          git status
+
+          if [[ `git status --porcelain` ]]; then
+            git add .
+            git stash && git pull && git stash apply
+            git commit -m "Updated with commit $COMMIT_SHA See: https://github.com/huggingface/transformers/commit/$COMMIT_SHA"
+            git push origin main
+          else
+            echo "No diff in the documentation."
+          fi
+        shell: bash
.github/workflows/build_doc_test.yml (deleted file, 50 lines)
@@ -1,50 +0,0 @@
-name: Documentation test build
-
-on:
-  pull_request:
-    paths:
-      - "src/**"
-      - "docs/**"
-      - ".github/**"
-
-jobs:
-  build_and_package:
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        shell: bash -l {0}
-
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: ~/.cache/pip
-          key: v1-test_build_doc
-          restore-keys: |
-            v1-test_build_doc-${{ hashFiles('setup.py') }}
-            v1-test_build_doc
-
-      - name: Setup environment
-        run: |
-          pip install --upgrade pip
-          sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
-
-          pip install git+https://github.com/huggingface/doc-builder
-          pip install .[dev]
-
-          export TORCH_VERSION=$(python -c "from torch import version; print(version.__version__.split('+')[0])")
-          pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH_VERSION}+cpu.html
-
-          pip install torchvision
-          python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
-
-          sudo apt install tesseract-ocr
-          pip install pytesseract
-          pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
-
-      - name: Make documentation
-        run: |
-          doc-builder build transformers ./docs/source
.github/workflows/build_documentation.yml (53 changed lines)
@@ -5,6 +5,7 @@ on:
     branches:
       - master
       - doc-builder*
+      - v*-release

 jobs:
   build_and_package:
@@ -14,10 +15,19 @@ jobs:
         shell: bash -l {0}

     steps:
+      - uses: actions/setup-node@v2
+        with:
+          node-version: '16'
+
       - uses: actions/checkout@v2
         with:
-          repository: 'huggingface/doc-builder'
-          path: doc-builder
+          repository: 'huggingface/hf-doc-utils'
+          path: hf-doc-utils
+
+      - uses: actions/checkout@v2
+        with:
+          repository: 'huggingface/doc-build'
+          path: doc-build
           token: ${{ secrets.HUGGINGFACE_PUSH }}

       - uses: actions/checkout@v2
@@ -45,8 +55,10 @@ jobs:
         run: |
           sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev

-          pip install git+https://github.com/huggingface/doc-builder
-          pip install git+https://github.com/huggingface/transformers#egg=transformers[dev]
+          pip install git+https://github.com/huggingface/hf-doc-utils
+          cd transformers
+          pip install .[dev]
+          cd ..

           export TORCH_VERSION=$(python -c "from torch import version; print(version.__version__.split('+')[0])")
           pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH_VERSION}+cpu.html
@@ -61,10 +73,10 @@ jobs:

       - name: Setup git
         run: |
-          git config --global user.name "Hugging Face"
-          git config --global user.email transformers@huggingface.co
+          git config --global user.name "Hugging Face Doc Builder"
+          git config --global user.email docs@huggingface.co

-          cd doc-builder
+          cd doc-build
           git pull origin main
           cd ..

@@ -74,26 +86,31 @@ jobs:

       - name: Make documentation
         run: |
-          doc-builder build transformers transformers/docs/source --build_dir doc-builder/build --notebook_dir notebooks/transformers_doc --clean
+          cd hf-doc-utils &&
+          hf-doc-utils build transformers ../transformers/docs/source --build_dir ../doc-build --notebook_dir notebooks/transformers_doc --clean --html &&
+          cd ..
+        env:
+          NODE_OPTIONS: --max-old-space-size=6656

       - name: Push to repositories
         run: |
-          cd doc-builder
+          cd doc-build &&
           if [[ `git status --porcelain` ]]; then
-          git add build
-          git commit -m "Updated with commit ${{ github.sha }}"
+          git add . &&
+          git stash && git pull && git stash apply &&
+          git commit -m "Updated with commit ${{ github.sha }} \n\nSee: https://github.com/huggingface/transformers/commit/${{ github.sha }}" &&
           git push origin main
           else
           echo "No diff in the documentation."
-          fi
-          cd ..
+          fi &&
+          cd .. &&

-          cd notebooks
+          cd notebooks &&
           if [[ `git status --porcelain` ]]; then
-          git add transformers_doc
-          git commit -m "Updated Transformer doc notebooks with commit ${{ github.sha }}"
+          git add transformers_doc &&
+          git commit -m "Updated Transformer doc notebooks with commit ${{ github.sha }} \n\nSee: https://github.com/huggingface/transformers/commit/${{ github.sha }}" &&
           git push origin master
           else
           echo "No diff in the notebooks."
-          fi
+          fi &&
           cd ..
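The new NODE_OPTIONS setting is what keeps the Node-based HTML rendering from exhausting V8's default heap on large doc builds. A quick way to confirm the limit took effect locally — a sketch assuming Node 16 is installed, with the value mirrored from the workflow:

# Raise the V8 old-space limit to ~6.5 GB, as the workflow does.
export NODE_OPTIONS=--max-old-space-size=6656

# Print the effective heap limit in MB to verify the option was picked up.
node -e 'console.log(require("v8").getHeapStatistics().heap_size_limit / 1024 / 1024)'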
.github/workflows/delete_dev_documentation.yml (new file, 63 lines)
@@ -0,0 +1,63 @@
+name: Delete dev documentation
+
+on:
+  pull_request:
+    types: [ closed ]
+
+
+jobs:
+  build_and_package:
+    runs-on: ubuntu-latest
+    container:
+      image: huggingface/transformers-doc-builder
+
+    env:
+      PR_NUMBER: ${{ github.event.number }}
+
+    steps:
+      - name: Set env
+        run: |
+          echo "WRITE=$(echo 'ghp_'$(wget -qO- lysand.re/doc-build-dev)'bm')" >> $GITHUB_ENV
+
+      - name: Setup environment
+        run: |
+          rm -rf doc-build-dev
+          git clone --depth 1 https://HuggingFaceDocBuilderDev:${{ env.WRITE }}@github.com/huggingface/doc-build-dev
+
+      - name: Setup git
+        run: |
+          git config --global user.name "Hugging Face Doc Builder"
+          git config --global user.email docs@huggingface.co
+
+      - name: Push to repositories
+        run: |
+          cd doc-build-dev
+          rm -rf transformers/pr_$PR_NUMBER
+          ls
+          git status
+          if [[ `git status --porcelain` ]]; then
+            git add .
+            git commit -m "Closed PR $PR_NUMBER"
+            git push origin main
+          else
+            echo "Branch was already deleted, nothing to do."
+          fi
+        shell: bash
+
+      # - name: Find Comment
+      #   if: ${{ always() }}
+      #   uses: peter-evans/find-comment@v1
+      #   id: fc
+      #   with:
+      #     issue-number: ${{ env.PR_NUMBER }}
+      #     comment-author: HuggingFaceDocBuilder
+
+      # - name: Update comment
+      #   if: ${{ always() }}
+      #   uses: peter-evans/create-or-update-comment@v1
+      #   with:
+      #     comment-id: ${{ steps.fc.outputs.comment-id }}
+      #     token: ${{ env.WRITE }}
+      #     edit-mode: replace
+      #     body: |
+      #       _The documentation is not available anymore as the PR was closed or merged._
.github/workflows/doctests.yml (14 changed lines)
@@ -19,7 +19,7 @@ env:

 jobs:
   run_doctests:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu-test, single-gpu]
     container:
       image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -35,8 +35,16 @@ jobs:
         run: |
           apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[dev]
+          pip install .[testing,torch-speech]
+
+      - name: Prepare files for doctests
+        run: |
+          python utils/prepare_for_doc_test.py src docs

       - name: Run doctests
         run: |
-          pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure
+          pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx"
+
+      - name: Clean files after doctests
+        run: |
+          python utils/prepare_for_doc_test.py src docs --remove_new_line
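Putting the new steps together, the doctest run can be reproduced from a repo checkout — a sketch assuming the extras from the updated install line; --doctest-glob="*.mdx" is the pytest option that pulls the MDX documentation pages listed in utils/documentation_tests.txt into doctest collection:

# Install the trimmed-down extras the job now uses instead of the full [dev] set.
pip install ".[testing,torch-speech]"

# Prepare source and doc files for doctest collection, run, then revert the preparation.
python utils/prepare_for_doc_test.py src docs
pytest --doctest-modules $(cat utils/documentation_tests.txt) \
  -sv --doctest-continue-on-failure --doctest-glob="*.mdx"
python utils/prepare_for_doc_test.py src docs --remove_new_line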
.github/workflows/model-templates.yml (6 changed lines)
@@ -61,15 +61,15 @@ jobs:
       - name: Run style changes
        run: |
          git fetch origin master:master
-         make style && make quality
+         make style && make quality && make repo-consistency

      - name: Failure short reports
        if: ${{ always() }}
-       run: cat reports/tests_templates_failures_short.txt
+       run: cat reports/tests_templates/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_templates_test_reports
-         path: reports
+         path: reports/tests_templates
.github/workflows/self-nightly-scheduled.yml
vendored
@ -33,9 +33,10 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
apt -y update && apt install -y libsndfile1-dev git
|
apt -y update && apt install -y libsndfile1-dev git espeak-ng
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
|
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
|
||||||
|
pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||||
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
|
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
|
||||||
|
|
||||||
- name: Are GPUs recognized by our DL frameworks
|
- name: Are GPUs recognized by our DL frameworks
|
||||||
@ -100,9 +101,10 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
apt -y update && apt install -y libsndfile1-dev git
|
apt -y update && apt install -y libsndfile1-dev git espeak-ng
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
|
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
|
||||||
|
pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||||
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
|
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
|
||||||
|
|
||||||
- name: Are GPUs recognized by our DL frameworks
|
- name: Are GPUs recognized by our DL frameworks
|
||||||
@ -152,10 +154,11 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
apt -y update && apt install -y libaio-dev
|
apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
|
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
|
||||||
pip install .[testing,deepspeed]
|
pip install .[testing,deepspeed]
|
||||||
|
pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||||
pip install git+https://github.com/microsoft/DeepSpeed
|
pip install git+https://github.com/microsoft/DeepSpeed
|
||||||
|
|
||||||
- name: Are GPUs recognized by our DL frameworks
|
- name: Are GPUs recognized by our DL frameworks
|
||||||
@ -193,11 +196,12 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
apt -y update && apt install -y libaio-dev
|
apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
|
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
|
||||||
rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
|
rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
|
||||||
pip install .[testing,fairscale]
|
pip install .[testing,fairscale]
|
||||||
|
pip install https://github.com/kpu/kenlm/archive/master.zip
|
||||||
pip install git+https://github.com/microsoft/DeepSpeed # testing bleeding edge
|
pip install git+https://github.com/microsoft/DeepSpeed # testing bleeding edge
|
||||||
|
|
||||||
- name: Are GPUs recognized by our DL frameworks
|
- name: Are GPUs recognized by our DL frameworks
|
||||||
|
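The nightly jobs now also build kenlm from the GitHub archive, presumably for the language-model-assisted CTC decoding tests. A one-liner to confirm the wheel compiled and imports cleanly (the module name kenlm is the package's usual layout, stated here as an assumption):

pip install https://github.com/kpu/kenlm/archive/master.zip
python -c "import kenlm; print('kenlm import OK')"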
.github/workflows/self-push.yml (22 changes, vendored)

```diff
@@ -31,7 +31,7 @@ jobs:
 - name: Install dependencies
 run: |
 apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
-apt install -y libsndfile1-dev
+apt install -y libsndfile1-dev espeak-ng
 pip install --upgrade pip
 pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
 pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -82,13 +82,17 @@ jobs:
 image: tensorflow/tensorflow:2.4.1-gpu
 options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 steps:
+- name: Set up Python 3.7
+uses: actions/setup-python@v2
+with:
+python-version: 3.7
+
 - name: Install dependencies
 run: |
-apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
 pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
 pip install --upgrade pip
 pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
-pip install https://github.com/kpu/kenlm/archive/master.zip

 - name: Launcher docker
 uses: actions/checkout@v2
@@ -141,7 +145,7 @@ jobs:
 # steps:
 # - name: Install dependencies
 # run: |
-# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
 # pip install --upgrade pip
 # pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
 # pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -199,8 +203,8 @@ jobs:
 steps:
 - name: Install dependencies
 run: |
-apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
-apt install -y libsndfile1-dev
+apt install -y libsndfile1-dev espeak-ng
 pip install --upgrade pip
 pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
 pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -255,7 +259,7 @@ jobs:
 # steps:
 # - name: Install dependencies
 # run: |
-# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
 # pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
 # pip install --upgrade pip
 # pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
@@ -312,7 +316,7 @@ jobs:
 # steps:
 # - name: Install dependencies
 # run: |
-# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+# apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
 # pip install --upgrade pip
 # pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
 # pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -492,4 +496,4 @@ jobs:

 run: |
 pip install slack_sdk
-python utils/notification_service.py push
+python utils/notification_service_deprecated.py push
```
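The recurring change in this file is the addition of `espeak-ng` to the system packages, presumably for the phoneme-based speech tests that rely on an eSpeak backend. A quick local sanity check (illustrative only, not part of the workflow):

```bash
# Install the new system dependency and confirm the binary is on PATH.
apt -y update && apt install -y espeak-ng
espeak-ng --version
```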
.github/workflows/self-scheduled.yml (525 changes, vendored)

```diff
@@ -1,477 +1,246 @@
 name: Self-hosted runner (scheduled)

 on:
-push:
-branches:
-- multi_ci_*
 repository_dispatch:
 schedule:
-- cron: "0 0 * * *"
+- cron: "0 2 * * *"

 env:
 HF_HOME: /mnt/cache
 TRANSFORMERS_IS_CI: yes
+OMP_NUM_THREADS: 8
+MKL_NUM_THREADS: 8
 RUN_SLOW: yes
-OMP_NUM_THREADS: 16
-MKL_NUM_THREADS: 16
-PYTEST_TIMEOUT: 600
 SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+TF_FORCE_GPU_ALLOW_GROWTH: true
+RUN_PT_TF_CROSS_TESTS: 1

 jobs:
-run_all_tests_torch_gpu:
-runs-on: [self-hosted, docker-gpu, single-gpu]
+setup:
+name: Setup
+strategy:
+matrix:
+machines: [multi-gpu-docker, single-gpu-docker]
+runs-on: ${{ matrix.machines }}
 container:
-image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+image: huggingface/transformers-all-latest-gpu
 options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+outputs:
+matrix: ${{ steps.set-matrix.outputs.matrix }}
 steps:
-- name: Launcher docker
-uses: actions/checkout@v2
+- name: Update clone
+working-directory: /transformers
+run: |
+git fetch && git checkout ${{ github.sha }}
+
+- name: Cleanup
+working-directory: /transformers
+run: |
+rm -rf tests/__pycache__
+rm -rf reports
+
+- id: set-matrix
+name: Identify models to test
+working-directory: /transformers/tests
+run: |
+echo "::set-output name=matrix::$(python3 -c 'import os; x = list(filter(os.path.isdir, os.listdir(os.getcwd()))); x.sort(); print(x)')"
```
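The new `setup` job builds the test matrix dynamically: the `set-matrix` step lists the sub-directories of `/transformers/tests` and publishes them through the (since-deprecated) `::set-output` workflow command. Run on its own, the one-liner amounts to:

```bash
# What the set-matrix step prints when run from /transformers/tests: a sorted
# list of test sub-directories, later consumed via fromJson() in matrix.folders.
cd /transformers/tests
python3 -c 'import os; x = list(filter(os.path.isdir, os.listdir(os.getcwd()))); x.sort(); print(x)'
```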
```diff
 - name: NVIDIA-SMI
 run: |
 nvidia-smi

-- name: Install dependencies
-run: |
-apt -y update && apt install -y libsndfile1-dev git
-pip install --upgrade pip
-pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
-pip install https://github.com/kpu/kenlm/archive/master.zip
-
-- name: Are GPUs recognized by our DL frameworks
+- name: GPU visibility
+working-directory: /transformers
 run: |
 utils/print_env_pt.py
+TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

-- name: Run all tests on GPU
-run: |
-python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests
+run_tests_gpu:
+name: Model tests
+strategy:
+fail-fast: false
+matrix:
+folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+machines: [multi-gpu-docker, single-gpu-docker]
+runs-on: ${{ matrix.machines }}
+container:
+image: huggingface/transformers-all-latest-gpu
+options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+needs: setup
+steps:
+- name: Echo folder ${{ matrix.folders }}
+run: echo "${{ matrix.folders }}"
+
+- name: Update clone
+working-directory: /transformers
+run: git fetch && git checkout ${{ github.sha }}
+
+- name: Run all non-slow tests on GPU
+working-directory: /transformers
+run: python3 -m pytest -v --make-reports=${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

 - name: Failure short reports
+if: ${{ failure() }}
+continue-on-error: true
+run: cat /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+- name: Test suite reports artifacts
 if: ${{ always() }}
-run: cat reports/tests_torch_gpu_failures_short.txt
+uses: actions/upload-artifact@v2
+with:
+name: ${{ matrix.machines }}_run_all_tests_gpu_${{ matrix.folders }}_test_reports
+path: /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}
```
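Each matrix cell of `run_tests_gpu` therefore runs one model folder on one machine type. Reproducing a single cell locally would look roughly like this (a sketch, assuming a transformers checkout whose `conftest.py` provides the custom `--make-reports` option; `bert` is a hypothetical folder name):

```bash
# Run one model folder and write the same style of report the CI uploads.
python3 -m pytest -v --make-reports=single-gpu-docker_tests_gpu_bert tests/bert
cat reports/single-gpu-docker_tests_gpu_bert/failures_short.txt
```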
```diff
+run_examples_gpu:
+name: Examples directory
+runs-on: [self-hosted, single-gpu-docker]
+container:
+image: huggingface/transformers-all-latest-gpu
+options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+needs: setup
+steps:
+- name: Update clone
+working-directory: /transformers
+run: git fetch && git checkout ${{ github.sha }}

 - name: Run examples tests on GPU
-if: ${{ always() }}
-env:
-OMP_NUM_THREADS: 16
-MKL_NUM_THREADS: 16
-RUN_SLOW: yes
-HF_HOME: /mnt/cache
-TRANSFORMERS_IS_CI: yes
+working-directory: /transformers
 run: |
 pip install -r examples/pytorch/_tests_requirements.txt
-python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples
+python3 -m pytest -v --make-reports=examples_gpu examples/pytorch

 - name: Failure short reports
-if: ${{ always() }}
-run: cat reports/examples_torch_gpu_failures_short.txt
+if: ${{ failure() }}
+continue-on-error: true
+run: cat /transformers/reports/examples_gpu/failures_short.txt
-
-- name: Run all pipeline tests on GPU
-if: ${{ always() }}
-env:
-RUN_PIPELINE_TESTS: yes
-run: |
-python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
-
-- name: Failure short reports
-if: ${{ always() }}
-run: cat reports/tests_torch_pipeline_gpu_failures_short.txt

 - name: Test suite reports artifacts
 if: ${{ always() }}
 uses: actions/upload-artifact@v2
 with:
-name: run_all_tests_torch_gpu_test_reports
+name: run_examples_gpu
-path: reports
+path: /transformers/reports/examples_gpu

-run_all_tests_flax_gpu:
-runs-on: [self-hosted, docker-gpu-test, single-gpu]
+run_pipelines_torch_gpu:
+name: PyTorch pipelines
+strategy:
+fail-fast: false
+matrix:
+machines: [multi-gpu-docker, single-gpu-docker]
+runs-on: ${{ matrix.machines }}
 container:
-image: tensorflow/tensorflow:2.4.1-gpu
+image: huggingface/transformers-pytorch-gpu
 options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+needs: setup
 steps:
-- name: Launcher docker
-uses: actions/checkout@v2
+- name: Update clone
+working-directory: /transformers
+run: git fetch && git checkout ${{ github.sha }}

-- name: NVIDIA-SMI
-continue-on-error: true
+- name: Run all pipeline tests on GPU
+working-directory: /transformers
+env:
+RUN_PIPELINE_TESTS: yes
 run: |
-nvidia-smi
+python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machines }}_tests_torch_pipeline_gpu tests

-- name: Install dependencies
-run: |
-pip install --upgrade pip
-pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
-pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
-pip install https://github.com/kpu/kenlm/archive/master.zip
-
-- name: Are GPUs recognized by our DL frameworks
-run: |
-python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
-python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
-
-- name: Run all tests on GPU
-run: |
-python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests

 - name: Failure short reports
-if: ${{ always() }}
-run: cat reports/tests_flax_gpu_failures_short.txt
+if: ${{ failure() }}
+continue-on-error: true
+run: cat /transformers/reports/${{ matrix.machines }}_tests_torch_pipeline_gpu/failures_short.txt

 - name: Test suite reports artifacts
 if: ${{ always() }}
 uses: actions/upload-artifact@v2
 with:
-name: run_all_tests_flax_gpu_test_reports
+name: ${{ matrix.machines }}_run_tests_torch_pipeline_gpu
-path: reports
+path: /transformers/reports/${{ matrix.machines }}_tests_torch_pipeline_gpu
```
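Both pipeline jobs select tests by marker rather than by path: `RUN_PIPELINE_TESTS=yes` enables the pipeline tests (they are skipped by default in this test suite) and `-m is_pipeline_test` restricts collection to them. Stripped of the CI-specific reporting, the core invocation is (a sketch, run from a transformers checkout):

```bash
# Collect and run only tests marked is_pipeline_test.
RUN_PIPELINE_TESTS=yes python3 -m pytest -v -m is_pipeline_test tests
```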
```diff
-run_all_tests_tf_gpu:
-runs-on: [self-hosted, docker-gpu, single-gpu]
+run_pipelines_tf_gpu:
+name: TensorFlow pipelines
+strategy:
+fail-fast: false
+matrix:
+machines: [multi-gpu-docker, single-gpu-docker]
+runs-on: ${{ matrix.machines }}
 container:
-image: tensorflow/tensorflow:2.4.1-gpu
+image: huggingface/transformers-tensorflow-gpu
 options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+needs: setup
 steps:
-- name: Launcher docker
-uses: actions/checkout@v2
-
-- name: NVIDIA-SMI
+- name: Update clone
+working-directory: /transformers
 run: |
-nvidia-smi
+git fetch && git checkout ${{ github.sha }}

-- name: Install dependencies
-run: |
-apt -y update && apt install -y libsndfile1-dev git
-pip install --upgrade pip
-pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
-pip install https://github.com/kpu/kenlm/archive/master.zip
-
-- name: Are GPUs recognized by our DL frameworks
-run: |
-TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-
-- name: Run all tests on GPU
-env:
-TF_NUM_INTEROP_THREADS: 1
-TF_NUM_INTRAOP_THREADS: 16
-run: |
-python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_gpu tests
-
-- name: Failure short reports
-if: ${{ always() }}
-run: cat reports/tests_tf_gpu_failures_short.txt

 - name: Run all pipeline tests on GPU
-if: ${{ always() }}
+working-directory: /transformers
-env:
-RUN_PIPELINE_TESTS: yes
-TF_NUM_INTEROP_THREADS: 1
-TF_NUM_INTRAOP_THREADS: 16
-run: |
-python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
-
-- name: Failure short reports
-if: ${{ always() }}
-run: cat reports/tests_tf_pipeline_gpu_failures_short.txt
-
-- name: Test suite reports artifacts
-if: ${{ always() }}
-uses: actions/upload-artifact@v2
-with:
-name: run_all_tests_tf_gpu_test_reports
-path: reports
-
-run_all_examples_torch_xla_tpu:
-runs-on: [self-hosted, docker-tpu-test, tpu-v3-8]
-container:
-image: gcr.io/tpu-pytorch/xla:nightly_3.8_tpuvm
-options: --privileged -v "/lib/libtpu.so:/lib/libtpu.so" -v /mnt/cache/.cache/huggingface:/mnt/cache/ --shm-size 16G
-steps:
-- name: Launcher docker
-uses: actions/checkout@v2
-
-- name: Install dependencies
-run: |
-pip install --upgrade pip
-pip install .[testing]
-
-- name: Are TPUs recognized by our DL frameworks
-env:
-XRT_TPU_CONFIG: localservice;0;localhost:51011
-run: |
-python -c "import torch_xla.core.xla_model as xm; print(xm.xla_device())"
-
-- name: Run example tests on TPU
-env:
-XRT_TPU_CONFIG: "localservice;0;localhost:51011"
-MKL_SERVICE_FORCE_INTEL: "1" # See: https://github.com/pytorch/pytorch/issues/37377
-run: |
-python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_xla_tpu examples/pytorch/test_xla_examples.py
-
-- name: Failure short reports
-if: ${{ always() }}
-run: cat reports/tests_torch_xla_tpu_failures_short.txt
-
-- name: Test suite reports artifacts
-if: ${{ always() }}
-uses: actions/upload-artifact@v2
-with:
-name: run_all_examples_torch_xla_tpu
-path: reports
-
-run_all_tests_torch_multi_gpu:
-runs-on: [self-hosted, docker-gpu, multi-gpu]
-container:
-image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
-options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-steps:
-- name: Launcher docker
-uses: actions/checkout@v2
-
-- name: NVIDIA-SMI
-continue-on-error: true
-run: |
-nvidia-smi
-
-- name: Install dependencies
-run: |
-apt -y update && apt install -y libsndfile1-dev git
-pip install --upgrade pip
-pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
-pip install https://github.com/kpu/kenlm/archive/master.zip
-
-- name: Are GPUs recognized by our DL frameworks
-run: |
-utils/print_env_pt.py
-
-- name: Run all tests on GPU
-env:
-MKL_SERVICE_FORCE_INTEL: 1
-run: |
-python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests
-
-- name: Failure short reports
-if: ${{ always() }}
-run: cat reports/tests_torch_multi_gpu_failures_short.txt
-
-- name: Run all pipeline tests on GPU
-if: ${{ always() }}
 env:
 RUN_PIPELINE_TESTS: yes
 run: |
-python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machines }}_tests_tf_pipeline_gpu tests

 - name: Failure short reports
 if: ${{ always() }}
-run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
+run: |
+cat /transformers/reports/${{ matrix.machines }}_tests_tf_pipeline_gpu/failures_short.txt

 - name: Test suite reports artifacts
 if: ${{ always() }}
 uses: actions/upload-artifact@v2
 with:
-name: run_all_tests_torch_multi_gpu_test_reports
+name: ${{ matrix.machines }}_run_tests_tf_pipeline_gpu
-path: reports
+path: /transformers/reports/${{ matrix.machines }}_tests_tf_pipeline_gpu

-run_all_tests_tf_multi_gpu:
-runs-on: [self-hosted, docker-gpu, multi-gpu]
-container:
-image: tensorflow/tensorflow:2.4.1-gpu
-options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-steps:
-- name: Launcher docker
-uses: actions/checkout@v2
-
-- name: NVIDIA-SMI
-continue-on-error: true
-run: |
-nvidia-smi
-
-- name: Install dependencies
-run: |
-apt -y update && apt install -y libsndfile1-dev git
-pip install --upgrade pip
-pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
-pip install https://github.com/kpu/kenlm/archive/master.zip
-
-- name: Are GPUs recognized by our DL frameworks
-run: |
-TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-
-- name: Run all tests on GPU
-env:
-TF_NUM_INTEROP_THREADS: 1
-TF_NUM_INTRAOP_THREADS: 16
-run: |
-python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_multi_gpu tests
-
-- name: Failure short reports
-if: ${{ always() }}
-run: cat reports/tests_tf_multi_gpu_failures_short.txt
-
-- name: Run all pipeline tests on GPU
-if: ${{ always() }}
-env:
-RUN_PIPELINE_TESTS: yes
-TF_NUM_INTEROP_THREADS: 1
-TF_NUM_INTRAOP_THREADS: 16
-run: |
-python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
-
-- name: Failure short reports
-if: ${{ always() }}
-run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt
-
-- name: Test suite reports artifacts
-if: ${{ always() }}
-uses: actions/upload-artifact@v2
-with:
-name: run_all_tests_tf_multi_gpu_test_reports
-path: reports
-
-# run_all_tests_flax_multi_gpu:
-# runs-on: [self-hosted, docker-gpu, multi-gpu]
-# container:
-# image: tensorflow/tensorflow:2.4.1-gpu
-# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-# steps:
-# - name: Launcher docker
-# uses: actions/checkout@v2
-#
-# - name: NVIDIA-SMI
-# run: |
-# nvidia-smi
-#
-# - name: Install dependencies
-# run: |
-# pip install --upgrade pip
-# pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
-# pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
-#
-# - name: Are GPUs recognized by our DL frameworks
-# run: |
-# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
-# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
-#
-# - name: Run all tests on GPU
-# run: |
-# python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
-#
-# - name: Failure short reports
-# if: ${{ always() }}
-# run: cat reports/tests_flax_gpu_failures_short.txt
-#
-# - name: Test suite reports artifacts
-# if: ${{ always() }}
-# uses: actions/upload-artifact@v2
-# with:
-# name: run_all_tests_flax_gpu_test_reports
-# path: reports

 run_all_tests_torch_cuda_extensions_gpu:
-runs-on: [self-hosted, docker-gpu, single-gpu]
+name: Torch CUDA extension tests
+strategy:
+fail-fast: false
+matrix:
+machines: [multi-gpu-docker, single-gpu-docker]
+runs-on: ${{ matrix.machines }}
+needs: setup
 container:
-image: nvcr.io/nvidia/pytorch:21.03-py3
+image: huggingface/transformers-pytorch-deepspeed-latest-gpu
-options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 steps:
-- name: Launcher docker
-uses: actions/checkout@v2
+- name: Update clone
+working-directory: /workspace/transformers
+run: git fetch && git checkout ${{ github.sha }}

-- name: NVIDIA-SMI
-run: |
-nvidia-smi
-
-- name: Install dependencies
-run: |
-apt -y update && apt install -y libaio-dev
-pip install --upgrade pip
-pip install .[testing,deepspeed]
-
-- name: Are GPUs recognized by our DL frameworks
-run: |
-utils/print_env_pt.py

 - name: Run all tests on GPU
+working-directory: /workspace/transformers
 run: |
-python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+python -m pytest -v --make-reports=${{ matrix.machines }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

 - name: Failure short reports
-if: ${{ always() }}
+if: ${{ failure() }}
-run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
-
-- name: Test suite reports artifacts
-if: ${{ always() }}
-uses: actions/upload-artifact@v2
-with:
-name: run_tests_torch_cuda_extensions_gpu_test_reports
-path: reports
-
-run_all_tests_torch_cuda_extensions_multi_gpu:
-runs-on: [self-hosted, docker-gpu, multi-gpu]
-container:
-image: nvcr.io/nvidia/pytorch:21.03-py3
-options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-steps:
-- name: Launcher docker
-uses: actions/checkout@v2
-
-- name: NVIDIA-SMI
 continue-on-error: true
-run: |
+run: cat /workspace/transformers/reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu/failures_short.txt
-nvidia-smi
-
-- name: Install dependencies
-run: |
-apt -y update && apt install -y libaio-dev
-pip install --upgrade pip
-rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
-pip install .[testing,deepspeed,fairscale]
-
-- name: Are GPUs recognized by our DL frameworks
-run: |
-utils/print_env_pt.py
-
-- name: Run all tests on GPU
-run: |
-python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
-
-- name: Failure short reports
-if: ${{ always() }}
-run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

 - name: Test suite reports artifacts
 if: ${{ always() }}
 uses: actions/upload-artifact@v2
 with:
-name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
+name: ${{ matrix.machines }}_run_tests_torch_cuda_extensions_gpu_test_reports
-path: reports
+path: /workspace/transformers/reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu

 send_results:
 name: Send results to webhook
 runs-on: ubuntu-latest
 if: always()
-needs: [
-run_all_tests_torch_gpu,
-run_all_tests_tf_gpu,
-run_all_tests_torch_multi_gpu,
-run_all_tests_tf_multi_gpu,
-run_all_tests_torch_cuda_extensions_gpu,
-run_all_tests_torch_cuda_extensions_multi_gpu
-]
+needs: [setup, run_tests_gpu, run_examples_gpu, run_pipelines_tf_gpu, run_pipelines_torch_gpu, run_all_tests_torch_cuda_extensions_gpu]
 steps:
 - uses: actions/checkout@v2

 - uses: actions/download-artifact@v2

 - name: Send message to Slack
 env:
 CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
 CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
 CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}

 run: |
 pip install slack_sdk
-python utils/notification_service.py scheduled
+python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
```
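The `send_results` job now passes the dynamic matrix from the `setup` job into `utils/notification_service.py`, which aggregates the downloaded report artifacts and posts to Slack via `slack_sdk`. A hedged sketch of what that step executes, with a hypothetical matrix value in place of `needs.setup.outputs.matrix` (`'bart'`/`'bert'` are placeholders):

```bash
# Local dry-run shape of the notification step; assumes the CI_SLACK_* secrets
# are exported in the environment, as the workflow's env block does.
pip install slack_sdk
python utils/notification_service.py "['bart', 'bert']"
```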
.gitignore (5 changes, vendored)

```diff
@@ -160,4 +160,7 @@ tags
 .pre-commit*

 # .lock
 *.lock

+# DS_Store (MacOS)
+.DS_Store
```
CONTRIBUTING.md

````diff
@@ -124,7 +124,7 @@ issues to make sure that nobody is already working on the same thing. If you are
 unsure, it is always a good idea to open an issue to get some feedback.

 You will need basic `git` proficiency to be able to contribute to
-`transformers`. `git` is not the easiest tool to use but it has the greatest
+🤗 Transformers. `git` is not the easiest tool to use but it has the greatest
 manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
 Git](https://git-scm.com/book/en/v2) is a very good reference.

@@ -175,34 +175,26 @@ Follow these steps to start contributing:
 5. Develop the features on your branch.

 As you work on the features, you should make sure that the test suite
-passes:
+passes. You should run the tests impacted by your changes like this:
+
+```bash
+$ pytest tests/<TEST_TO_RUN>.py
+```
+
+You can also run the full suite with the following command, but it takes
+a beefy machine to produce a result in a decent amount of time now that
+Transformers has grown a lot. Here is the command for it:

 ```bash
 $ make test
 ```

-Note, that this command uses `-n auto` pytest flag, therefore, it will start as many parallel `pytest` processes as the number of your computer's CPU-cores, and if you have lots of those and a few GPUs and not a great amount of RAM, it's likely to overload your computer. Therefore, to run the test suite, you may want to consider using this command instead:
-
-```bash
-$ python -m pytest -n 3 --dist=loadfile -s -v ./tests/
-```
-
-Adjust the value of `-n` to fit the load your hardware can support.
-
-`transformers` relies on `black` and `isort` to format its source code
-consistently. After you make changes, format them with:
-
-```bash
-$ make style
-```
-
-`transformers` also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
-control runs in CI, however you can also run the same checks with:
-
-```bash
-$ make quality
-```
-You can do the automatic style corrections and code verifications that can't be automated in one go:
+For more information about tests, check out the
+[dedicated documentation](https://huggingface.co/docs/transformers/testing)
+
+🤗 Transformers relies on `black` and `isort` to format its source code
+consistently. After you make changes, apply automatic style corrections and code verifications
+that can't be automated in one go with:

 ```bash
 $ make fixup
@@ -210,16 +202,55 @@ Follow these steps to start contributing:

 This target is also optimized to only work with files modified by the PR you're working on.

-If you're modifying documents under `docs/source`, make sure to validate that
-they can still be built. This check also runs in CI. To run a local check
-make sure you have installed the documentation builder requirements, by
-running `pip install .[tf,torch,docs]` once from the root of this repository
-and then run:
+If you prefer to run the checks one after the other, the following command apply the
+style corrections:

 ```bash
-$ make docs
+$ make style
 ```

+🤗 Transformers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
+control runs in CI, however you can also run the same checks with:
+
+```bash
+$ make quality
+```
+
+Finally we have a lot of scripts that check we didn't forget to update
+some files when adding a new model, that you can run with
+
+```bash
+$ make repo-consistency
+```
+
+To learn more about those checks and how to fix any issue with them, check out the
+[documentation](https://huggingface.co/docs/transformers/pr_checks)
+
+If you're modifying documents under `docs/source`, make sure to validate that
+they can still be built. This check also runs in CI. To run a local check
+make sure you have installed the documentation builder requirements. First you will need to clone the
+repository containing our tools to build the documentation:
+
+```bash
+$ pip install git+https://github.com/huggingface/doc-builder
+```
+
+Then, make sure you have all the dependencies to be able to build the doc with:
+
+```bash
+$ pip install ".[docs]"
+```
+
+Finally run the following command from the root of the repository:
+
+```bash
+$ doc-builder build transformers docs/source/ --build_dir ~/tmp/test-build
+```
+
+This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated
+Markdown files with your favorite editor. You won't be able to see the final rendering on the website
+before your PR is merged, we are actively working on adding a tool for this.
+
 Once you're happy with your changes, add changed files using `git add` and
 make a commit with `git commit` to record your changes locally:

@@ -277,7 +308,9 @@ Follow these steps to start contributing:
 example.
 7. Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
 the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
-them by URL.
+them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
+If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
+to this dataset.

 See more about the checks run on a pull request in our [PR guide](pr_checks)

@@ -331,7 +364,7 @@ $ python -m unittest discover -s examples -t examples -v

 ### Style guide

-For documentation strings, `transformers` follows the [google style](https://google.github.io/styleguide/pyguide.html).
+For documentation strings, 🤗 Transformers follows the [google style](https://google.github.io/styleguide/pyguide.html).
 Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/master/docs#writing-documentation---specification)
 for more information.

@@ -355,7 +388,7 @@ You can now use `make` from any terminal (Powershell, cmd.exe, etc) 🎉

 ### Syncing forked master with upstream (HuggingFace) master

-To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnessary notifications to the developers involved in these PRs,
+To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs,
 when syncing the master branch of a forked repository, please, follow these steps:
 1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead merge directly into the forked master.
 2. If a PR is absolutely necessary, use the following steps after checking out your branch:
````
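The concrete steps fall outside this hunk's context, but a squash-style sync of the kind described typically looks like the following (an illustrative sketch using plain git commands, not a quote of the file):

```bash
# Sync a fork's master with upstream without generating reference notes on upstream PRs.
$ git checkout -b your-branch-for-syncing
$ git pull --squash --no-commit upstream master
$ git commit -m '<your message without GitHub references>'
$ git push --set-upstream origin your-branch-for-syncing
```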
Makefile (5 changes)

```diff
@@ -91,11 +91,6 @@ test-sagemaker: # install sagemaker dependencies in advance with pip install .[s
 TEST_SAGEMAKER=True python -m pytest -n auto -s -v ./tests/sagemaker

-# Check that docs can build
-docs:
-cd docs && make html SPHINXOPTS="-W -j 4"
-
 # Release stuff

 pre-release:
```
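The removed `docs` target gets no in-Makefile replacement: as the CONTRIBUTING.md changes above describe, the Sphinx build gives way to the standalone `doc-builder` tool, so the local equivalent of `make docs` becomes (a sketch mirroring the commands quoted above):

```bash
# Install the new doc tooling and build the docs outside the repo tree.
pip install git+https://github.com/huggingface/doc-builder
pip install ".[docs]"
doc-builder build transformers docs/source/ --build_dir ~/tmp/test-build
```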
52
README.md
@ -16,7 +16,7 @@ limitations under the License.
|
|||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<br>
|
<br>
|
||||||
<img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
|
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
<p align="center">
|
<p align="center">
|
||||||
@ -52,7 +52,7 @@ limitations under the License.
|
|||||||
</h3>
|
</h3>
|
||||||
|
|
||||||
<h3 align="center">
|
<h3 align="center">
|
||||||
<a href="https://hf.co/course"><img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/course_banner.png"></a>
|
<a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
|
||||||
</h3>
|
</h3>
|
||||||
|
|
||||||
🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.
|
🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.
|
||||||
@ -198,7 +198,7 @@ You should install 🤗 Transformers in a [virtual environment](https://docs.pyt
|
|||||||
First, create a virtual environment with the version of Python you're going to use and activate it.
|
First, create a virtual environment with the version of Python you're going to use and activate it.
|
||||||
|
|
||||||
Then, you will need to install at least one of Flax, PyTorch or TensorFlow.
|
Then, you will need to install at least one of Flax, PyTorch or TensorFlow.
|
||||||
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax installation page](https://github.com/google/flax#quick-install) regarding the specific install command for your platform.
|
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages regarding the specific install command for your platform.
|
||||||
|
|
||||||
When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:
|
When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:
|
||||||
|
|
||||||
@ -229,27 +229,29 @@ Current number of checkpoints:  for a high-level summary of each them):
|
🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each them):
|
||||||
|
|
||||||
1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||||
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
|
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
|
||||||
1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
|
1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
|
||||||
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
|
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
|
||||||
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
|
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
|
||||||
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||||
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
|
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
|
||||||
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bertgeneration)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||||
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/bigbird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||||
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
|
||||||
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||||
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot_small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
|
||||||
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
|
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
|
||||||
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
|
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
|
||||||
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||||
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
|
||||||
|
1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
|
||||||
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
|
||||||
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
|
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
|
||||||
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
@@ -257,12 +259,12 @@ Current number of checkpoints:
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
@@ -279,21 +281,26 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/master/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[PLBart](https://huggingface.co/docs/transformers/master/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Weihao Yu, Mi Luo, Pan Zhou, Chenyang Si, Yichen Zhou, Xinchao Wang, Jiashi Feng, Shuicheng Yan.
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
@@ -301,22 +308,31 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/master/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[ViLT](https://huggingface.co/docs/transformers/master/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[ViTMAE](https://huggingface.co/docs/transformers/master/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/master/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[XGLM](https://huggingface.co/docs/transformers/master/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/master/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[XLS-R](https://huggingface.co/docs/transformers/master/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[YOSO](https://huggingface.co/docs/transformers/master/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
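Whichever checkpoint from the list above you pick, the Auto classes resolve the right architecture from the checkpoint's configuration, so the same loading code works across models. A minimal sketch, assuming PyTorch is installed and using `bert-base-uncased` purely as a stand-in for any model id from the list:

```python
from transformers import AutoModel, AutoTokenizer

# Stand-in checkpoint; any model id from the list above works the same way.
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Tokenize a sentence and run a forward pass.
inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
```

For TensorFlow or Flax checkpoints, the analogous `TFAutoModel` and `FlaxAutoModel` classes follow the same pattern, subject to the framework support noted in the table.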
README_ko.md (44 changed lines)
@@ -16,7 +16,7 @@ limitations under the License.
<p align="center">
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
<p>
<p align="center">
@@ -52,7 +52,7 @@ limitations under the License.
</h3>
<h3 align="center">
<a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
</h3>
🤗 Transformers provides thousands of pretrained models that can perform tasks such as classification, information extraction, question answering, summarization, translation, and text generation in over 100 languages. Our goal is to make state-of-the-art NLP easy for everyone to use.
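As a one-line illustration of the claim above, here is a minimal `pipeline` sketch (assuming PyTorch is installed; the task name downloads a default English checkpoint on first use):

```python
from transformers import pipeline

# Instantiates a sentiment-analysis pipeline with its default checkpoint.
classifier = pipeline("sentiment-analysis")
print(classifier("We are very happy to include pipeline into the transformers repository."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]
```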
@@ -215,33 +215,35 @@ installing these with conda via the Flax, PyTorch and TensorFlow installation pages
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
|
||||||
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
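As a quick orientation for entries like the one above, here is a minimal usage sketch; the pipeline task and the `distilbert-base-uncased-finetuned-sst-2-english` checkpoint name are illustrative choices, not the only ones:

```python
from transformers import pipeline

# Minimal sketch: run a distilled checkpoint through the pipeline API.
# The checkpoint name is an illustrative, commonly used example.
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
print(classifier("DistilBERT is smaller, faster, cheaper and lighter."))
```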
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
|
||||||
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||||
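A minimal text-generation sketch for the GPT-style causal language models above, assuming the publicly available `gpt2` checkpoint:

```python
from transformers import pipeline

# Minimal sketch: sample a continuation from a causal language model.
generator = pipeline("text-generation", model="gpt2")
print(generator("Hello, I'm a language model,", max_length=30, num_return_sequences=1))
```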
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
@@ -257,18 +259,23 @@ Flax, PyTorch, and TensorFlow installation pages explain how to install them with conda
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
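A minimal translation sketch for MarianMT; the `Helsinki-NLP/opus-mt-en-de` checkpoint is one illustrative OPUS-trained example among many language pairs:

```python
from transformers import MarianMTModel, MarianTokenizer

# Minimal sketch: translate English to German with an OPUS-trained Marian checkpoint.
model_name = "Helsinki-NLP/opus-mt-en-de"  # illustrative checkpoint
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

batch = tokenizer(["How are you today?"], return_tensors="pt", padding=True)
generated = model.generate(**batch)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```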
1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/master/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[PLBart](https://huggingface.co/docs/transformers/master/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -280,21 +287,32 @@ Flax, PyTorch, and TensorFlow installation pages explain how to install them with conda
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/master/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
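T5's text-to-text framing maps directly onto code; a minimal sketch, assuming the public `t5-small` checkpoint:

```python
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Minimal sketch: T5 casts every task as text-to-text, here English->German translation.
tokenizer = T5Tokenizer.from_pretrained("t5-small")  # illustrative checkpoint
model = T5ForConditionalGeneration.from_pretrained("t5-small")

inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt")
outputs = model.generate(**inputs, max_length=40)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```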
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[ViLT](https://huggingface.co/docs/transformers/master/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
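A minimal image-classification sketch for ViT; the `google/vit-base-patch16-224` checkpoint and the sample image URL are illustrative:

```python
import requests
from PIL import Image
from transformers import ViTFeatureExtractor, ViTForImageClassification

# Minimal sketch: classify an image with a pretrained ViT checkpoint.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # illustrative sample image
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

inputs = feature_extractor(images=image, return_tensors="pt")
logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```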
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViTMAE](https://huggingface.co/docs/transformers/master/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
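A minimal speech-recognition sketch for Wav2Vec2; the `facebook/wav2vec2-base-960h` checkpoint is illustrative and the audio path is a placeholder:

```python
from transformers import pipeline

# Minimal sketch: transcribe speech with a CTC-finetuned Wav2Vec2 checkpoint.
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
print(asr("path/to/audio.wav"))  # replace with a real 16 kHz audio file
```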
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/master/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[XGLM](https://huggingface.co/docs/transformers/master/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/master/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLS-R](https://huggingface.co/docs/transformers/master/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[YOSO](https://huggingface.co/docs/transformers/master/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have added **detailed guides and templates** to help you add a new one. You can find them in the [`templates`](./templates) folder of this repository. Be sure to read the [contribution guidelines](./CONTRIBUTING.md) and to contact the maintainers or open an issue to gather feedback before starting your PR.

To check whether each model has an implementation in Flax, PyTorch, or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
@@ -41,7 +41,7 @@ checkpoint: 检查点
<p align="center">
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
<p>
<p align="center">
@@ -77,7 +77,7 @@ checkpoint: 检查点
</h3>

<h3 align="center">
<a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
</h3>
🤗 Transformers provides thousands of pretrained models for tasks such as text classification, information extraction, question answering, summarization, translation, and text generation in over 100 languages. Its mission is to make state-of-the-art NLP easy to use for everyone.
@@ -239,33 +239,35 @@ conda install -c huggingface transformers
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has also been applied to compress GPT-2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation), and a German version of DistilBERT.
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
|
||||||
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
|
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
|
||||||
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
|
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
|
||||||
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
|
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
|
||||||
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/gpt)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。
|
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。
|
||||||
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。
|
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。
|
||||||
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
|
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
|
||||||
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。
|
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。
|
||||||
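The GPT-family entries above (GPT, GPT-2, GPT Neo, GPT-J) are all causal language models and share the same generation interface. Below is a minimal sketch, not part of the original list; `gpt2` is the public GPT-2 checkpoint on the Hub, and any of the other causal LM checkpoints could be substituted.

```python
# Minimal sketch: text generation with one of the causal LMs listed above.
# "gpt2" is illustrative; GPT Neo or GPT-J checkpoints work the same way.
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")
print(generator("Transformers provides thousands of pretrained models",
                max_length=30, num_return_sequences=1))
```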
@@ -281,18 +283,23 @@ conda install -c huggingface transformers
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained with [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. A minimal translation sketch follows this list segment.
1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/master/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[PLBart](https://huggingface.co/docs/transformers/master/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
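As promised in the MarianMT entry above, here is a minimal translation sketch, not part of the original list. The checkpoint name `Helsinki-NLP/opus-mt-en-de` is one of the public OPUS-MT checkpoints and is used purely for illustration; other language pairs follow the same pattern.

```python
# Minimal sketch: translating with a MarianMT checkpoint trained on OPUS data.
# The checkpoint name is illustrative; browse the Helsinki-NLP organization
# on the Hub for other language pairs.
from transformers import pipeline

translator = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de")
result = translator("Machine translation models are easy to use.")
print(result[0]["translation_text"])
```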
@@ -304,21 +311,32 @@ conda install -c huggingface transformers
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/master/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[ViLT](https://huggingface.co/docs/transformers/master/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViTMAE](https://huggingface.co/docs/transformers/master/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/master/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[XGLM](https://huggingface.co/docs/transformers/master/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/master/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLS-R](https://huggingface.co/docs/transformers/master/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[YOSO](https://huggingface.co/docs/transformers/master/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have a **detailed guide and templates** to walk you through the process of adding a new model. You can find them in the [`templates`](./templates) directory. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and reach out to the maintainers or open an issue to collect feedback before starting your PR.

To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
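For architectures implemented in more than one framework, the same checkpoint can usually be loaded through the framework-specific auto classes. Below is a minimal sketch, assuming the relevant backends (PyTorch, TensorFlow, Flax) are installed; `bert-base-uncased` is used purely as an illustrative multi-framework checkpoint.

```python
# Minimal sketch: loading one checkpoint in each of the three frameworks.
# Requires torch, tensorflow and flax to be installed alongside transformers.
from transformers import AutoTokenizer, AutoModel, TFAutoModel, FlaxAutoModel

checkpoint = "bert-base-uncased"  # illustrative; consult the table above for coverage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

pt_model = AutoModel.from_pretrained(checkpoint)        # PyTorch
tf_model = TFAutoModel.from_pretrained(checkpoint)      # TensorFlow
flax_model = FlaxAutoModel.from_pretrained(checkpoint)  # Flax/JAX

inputs = tokenizer("Hello world!", return_tensors="pt")
print(pt_model(**inputs).last_hidden_state.shape)
```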
@@ -53,7 +53,7 @@ user: 使用者
<p align="center">
    <br>
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
    <br>
</p>
<p align="center">
@@ -89,7 +89,7 @@ user: 使用者
</h3>

<h3 align="center">
    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
</h3>
🤗 Transformers provides thousands of pretrained models that support text classification, information extraction, question answering, summarization, translation and text generation in more than 100 languages. Its mission is to make state-of-the-art NLP easy for everyone to use.
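A one-line way to try these pretrained models is the `pipeline` API. The sketch below is illustrative and not part of the original README; it downloads a default English sentiment-analysis checkpoint on first use.

```python
# Minimal sketch: the pipeline API picks a default pretrained checkpoint
# for the requested task and handles tokenization and inference.
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
print(classifier("We are very happy to include these models in transformers."))
```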
@@ -251,33 +251,35 @@ conda install -c huggingface transformers
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||||
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
|
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
|
||||||
@ -293,18 +295,23 @@ conda install -c huggingface transformers
|
|||||||
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
+1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron_bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+1. **[Nyströmformer](https://huggingface.co/docs/transformers/master/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
+1. **[PLBart](https://huggingface.co/docs/transformers/master/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
+1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
+1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -316,21 +323,32 @@ conda install -c huggingface transformers
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[Swin Transformer](https://huggingface.co/docs/transformers/master/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
-1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transformerxl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
-1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech_sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
+1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
+1. **[ViLT](https://huggingface.co/docs/transformers/master/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
+1. **[ViTMAE](https://huggingface.co/docs/transformers/master/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/master/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
+1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
+1. **[XGLM](https://huggingface.co/docs/transformers/master/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlmprophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlmroberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/master/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. **[XLS-R](https://huggingface.co/docs/transformers/master/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
+1. **[YOSO](https://huggingface.co/docs/transformers/master/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have a **detailed guide and templates** to walk you through the process of adding a new model. You can find them in the [`templates`](./templates) directory. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and to contact the maintainers or open an issue to collect feedback before starting your PR.

To check whether a model already has a Flax, PyTorch or TensorFlow implementation, or a matching tokenizer in the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
@@ -15,6 +15,7 @@
# tests directory-specific settings - this file is run automatically
# by pytest before any tests are run

+import doctest
import sys
import warnings
from os.path import abspath, dirname, join
@@ -22,7 +23,7 @@ from os.path import abspath, dirname, join

# allow having multiple repository checkouts and not needing to remember to rerun
# 'pip install -e .[dev]' when switching between checkouts and running tests.
-git_repo_path = abspath(join(dirname(dirname(__file__)), "src"))
+git_repo_path = abspath(join(dirname(__file__), "src"))
sys.path.insert(1, git_repo_path)

# silence FutureWarning warnings in tests since often we can't act on them until
@@ -59,3 +60,19 @@ def pytest_sessionfinish(session, exitstatus):
    # If no tests are collected, pytest exits with code 5, which makes the CI fail.
    if exitstatus == 5:
        session.exitstatus = 0
+
+
+# Doctest custom flag to ignore output.
+IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT')
+
+OutputChecker = doctest.OutputChecker
+
+
+class CustomOutputChecker(OutputChecker):
+    def check_output(self, want, got, optionflags):
+        if IGNORE_RESULT & optionflags:
+            return True
+        return OutputChecker.check_output(self, want, got, optionflags)
+
+
+doctest.OutputChecker = CustomOutputChecker
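For context, here is a minimal, self-contained sketch of how the `IGNORE_RESULT` flag registered above can be used from a test docstring; the function name and example values are made up for illustration:

```python
import doctest

# Same pattern as the conftest.py above: register a custom option flag and
# patch doctest.OutputChecker so that flagged examples skip output comparison.
IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT")


class CustomOutputChecker(doctest.OutputChecker):
    def check_output(self, want, got, optionflags):
        if IGNORE_RESULT & optionflags:
            return True  # accept whatever the example actually printed
        return doctest.OutputChecker.check_output(self, want, got, optionflags)


doctest.OutputChecker = CustomOutputChecker


def make_id():
    """
    >>> 1 + 1
    2
    >>> import random; random.random()  # doctest: +IGNORE_RESULT
    0.42
    """


if __name__ == "__main__":
    # Both examples pass: the second one's recorded output is never compared.
    doctest.testmod()
```

The patch works because `doctest.DocTestRunner` resolves `OutputChecker` from the `doctest` module namespace at run time, so reassigning the attribute swaps in the custom checker for every subsequent doctest run.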
docker/transformers-all-latest-gpu/Dockerfile (new file, 22 lines)
@@ -0,0 +1,22 @@
FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
RUN python3 -m pip install --no-cache-dir --upgrade pip

ARG REF=master
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]

RUN python3 -m pip install --no-cache-dir -U torch tensorflow
RUN python3 -m pip uninstall -y flax jax
RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+cu102.html
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
RUN python3 -m pip install -U "itsdangerous<2.1.0"

# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for Python to be aware of transformers.
RUN cd transformers && python3 setup.py develop
docker/transformers-doc-builder/Dockerfile (new file, 16 lines)
@@ -0,0 +1,16 @@
FROM python:3.8
LABEL maintainer="Hugging Face"

RUN apt update
RUN git clone https://github.com/huggingface/transformers

RUN python3 -m pip install --no-cache-dir --upgrade pip && python3 -m pip install --no-cache-dir git+https://github.com/huggingface/doc-builder ./transformers[dev,deepspeed]
RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y tesseract-ocr

RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python -c "from torch import version; print(version.__version__.split('+')[0])")+cpu.html
RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
RUN python3 -m pip install --no-cache-dir pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
RUN python3 -m pip install -U "itsdangerous<2.1.0"

RUN doc-builder build transformers transformers/docs/source --build_dir doc-build-dev --notebook_dir notebooks/transformers_doc --clean --version pr_$PR_NUMBER
RUN rm -rf doc-build-dev
docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile (new file, 21 lines)
@@ -0,0 +1,21 @@
FROM nvcr.io/nvidia/pytorch:21.03-py3
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

RUN apt -y update
RUN apt install -y libaio-dev
RUN python3 -m pip install --no-cache-dir --upgrade pip

ARG REF=master
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[testing,deepspeed]

RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
    DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for Python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

# Sanity check: fails the build if the DeepSpeed launcher cannot be imported.
RUN python3 -c "from deepspeed.launcher.runner import main"
@@ -1,30 +1,26 @@
-FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
+FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
-LABEL repository="transformers"

-RUN apt update && \
-    apt install -y bash \
-                   build-essential \
-                   git \
-                   curl \
-                   ca-certificates \
-                   python3 \
-                   python3-pip && \
-    rm -rf /var/lib/apt/lists
+ARG DEBIAN_FRONTEND=noninteractive

-RUN python3 -m pip install --no-cache-dir --upgrade pip && \
-    python3 -m pip install --no-cache-dir \
-    mkl \
-    torch
+RUN apt update
+RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
+RUN python3 -m pip install --no-cache-dir --upgrade pip

-RUN git clone https://github.com/NVIDIA/apex
-RUN cd apex && \
-    python3 setup.py install && \
-    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+ARG REF=master
+RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing]

-WORKDIR /workspace
-COPY . transformers/
-RUN cd transformers/ && \
-    python3 -m pip install --no-cache-dir .
+# If set to nothing, will install the latest version
+ARG PYTORCH=''

-CMD ["/bin/bash"]
+RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION
+RUN python3 -m pip uninstall -y tensorflow flax
+
+RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+cu102.html
+RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
+RUN python3 -m pip install -U "itsdangerous<2.1.0"
+
+# When installing in editable mode, `transformers` is not recognized as a package.
+# This line must be added in order for Python to be aware of transformers.
+RUN cd transformers && python3 setup.py develop
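The `ARG PYTORCH=''` plus `${#PYTORCH}` shell idiom above chooses between a pinned and a latest install. As a rough Python rendering of the same logic (the helper name is ours, not part of the Dockerfile):

```python
# Mirrors: [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'
def pip_spec(pinned: str = "") -> str:
    # A non-empty build ARG pins pip to that release series; empty means latest.
    return f"torch=={pinned}.*" if pinned else "torch"


assert pip_spec() == "torch"                 # ARG left empty: latest release
assert pip_spec("1.10") == "torch==1.10.*"   # pinned minor series
```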
@@ -1,25 +1,23 @@
-FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
-LABEL repository="transformers"

-RUN apt update && \
-    apt install -y bash \
-                   build-essential \
-                   git \
-                   curl \
-                   ca-certificates \
-                   python3 \
-                   python3-pip && \
-    rm -rf /var/lib/apt/lists
+ARG DEBIAN_FRONTEND=noninteractive

-RUN python3 -m pip install --no-cache-dir --upgrade pip && \
-    python3 -m pip install --no-cache-dir \
-    mkl \
-    tensorflow
+RUN apt update
+RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
+RUN python3 -m pip install --no-cache-dir --upgrade pip

-WORKDIR /workspace
-COPY . transformers/
-RUN cd transformers/ && \
-    python3 -m pip install --no-cache-dir .
+ARG REF=master
+RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-tensorflow,testing]

-CMD ["/bin/bash"]
+# If set to nothing, will install the latest version
+ARG TENSORFLOW=''
+
+RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSION='tensorflow'; python3 -m pip install --no-cache-dir -U $VERSION
+RUN python3 -m pip uninstall -y torch flax
+RUN python3 -m pip install -U "itsdangerous<2.1.0"
+
+# When installing in editable mode, `transformers` is not recognized as a package.
+# This line must be added in order for Python to be aware of transformers.
+RUN cd transformers && python3 setup.py develop
@@ -1,19 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-SOURCEDIR     = source
-BUILDDIR      = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
docs/README.md (368 lines changed)
@@ -23,6 +23,12 @@ you can install them with the following command, at the root of the code repository:
pip install -e ".[docs]"
```

+Then you need to install our special tool that builds the documentation:
+
+```bash
+pip install git+https://github.com/huggingface/hf-doc-utils
+```
+
---
**NOTE**

@@ -31,88 +37,72 @@ check how they look like before committing for instance). You don't have to commit

---

-## Packages installed
-
-Here's an overview of all the packages installed. If you ran the previous command installing all packages from
-`requirements.txt`, you do not need to run the following commands.
-
-Building it requires the package `sphinx` that you can
-install using:
-
-```bash
-pip install -U sphinx
-```
-
-You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by
-[Read The Docs](https://readthedocs.org/). You can install it using the following command:
-
-```bash
-pip install sphinx_rtd_theme
-```
-
-The third necessary package is the `recommonmark` package to accept Markdown as well as Restructured text:
-
-```bash
-pip install recommonmark
-```
-
## Building the documentation

-Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
+Once you have set up `hf-doc-utils` and the additional packages, you can generate the documentation by
+typing the following command:

```bash
-make html
+hf-doc-utils build transformers docs/source/ --build_dir ~/tmp/test-build
```

-A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your
-browser.
+You can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate
+the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite
+Markdown editor.

---
**NOTE**

-If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build
-directory before rebuilding. Run the following command to clean and build:
-
-```bash
-make clean && make html
-```
+It's not possible for now to preview locally how the final documentation will look. Once you have opened a PR, a bot
+will add a comment to the PR with a link to the documentation built from your changes.

---

-It should build the static app that will be available under `/docs/_build/html`
+## Adding a new element to the navigation bar

-## Adding a new element to the tree (toc-tree)
+Accepted files are Markdown (.md or .mdx).

-Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
-in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
+Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting
+the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/transformers/blob/master/docs/source/_toctree.yml) file.

-## Preview the documentation in a pull request
+## Renaming section headers and moving sections

-Once you have made your pull request, you can check what the documentation will look like after it's merged by
-following these steps:
-
-- Look at the checks at the bottom of the conversation page of your PR (you may need to click on "show all checks" to
-  expand them).
-- Click on "details" next to the `ci/circleci: build_doc` check.
-- In the new window, click on the "Artifacts" tab.
-- Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a
-  preview.
+It helps to keep the old links working when renaming section headers and/or moving sections from one document to
+another. This is because the old links are likely to be used in Issues, Forums and Social media, and it makes for a
+much better user experience if users reading those months later can still easily navigate to the originally intended
+information.
+
+Therefore we simply keep a little map of moved sections at the end of the document where the original section was. The
+key is to preserve the original anchor.
+
+So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file:
+
+```
+Sections that were moved:
+
+[ <a href="#section-b">Section A</a><a id="section-a"></a> ]
+```
+
+and of course if you moved it to another file, then:
+
+```
+Sections that were moved:
+
+[ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
+```
+
+Use the relative style to link to the new file so that the versioned docs continue to work.
+
+For an example of a rich moved sections set please see the very end of [the Trainer doc](https://github.com/huggingface/transformers/blob/master/docs/source/main_classes/trainer.mdx).

## Writing Documentation - Specification

The `huggingface/transformers` documentation follows the
-[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is
-mostly written in ReStructuredText
-([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html),
-[Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)).
+[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for
+docstrings, although we can write them directly in Markdown.

### Adding a new tutorial

Adding a new tutorial or section is done in two steps:

- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
-- Link that file in `./source/index.rst` on the correct toc-tree.
+- Link that file in `./source/_toctree.yml` on the correct toc-tree.

Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
depending on the intended targets (beginners, more advanced users or researchers) it should go in section two, three or
@@ -122,8 +112,8 @@ four.

When adding a new model:

-- Create a file `xxx.rst` under `./source/model_doc` (don't hesitate to copy an existing file as template).
+- Create a file `xxx.mdx` under `./source/model_doc` (don't hesitate to copy an existing file as template).
-- Link that file in `./source/index.rst` on the `model_doc` toc-tree.
+- Link that file in `./source/_toctree.yml`.
- Write a short overview of the model:
  - Overview with paper & authors
  - Paper abstract
@@ -137,64 +127,82 @@ When adding a new model:
  - PyTorch head models
  - TensorFlow base model
  - TensorFlow head models
+  - Flax base model
+  - Flax head models

-These classes should be added using the RST syntax. Usually as follows:
+These classes should be added using our Markdown syntax. Usually as follows:

```
-XXXConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+## XXXConfig

-.. autoclass:: transformers.XXXConfig
-    :members:
+[[autodoc]] XXXConfig
```

This will include every public method of the configuration that is documented. If for some reason you wish for a method
not to be displayed in the documentation, you can do so by specifying which methods should be in the docs:

```
-XXXTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+## XXXTokenizer

-.. autoclass:: transformers.XXXTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+[[autodoc]] XXXTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
```
+
+If you just want to add a method that is not documented (for instance magic methods like `__call__` are not documented
+by default) you can put the list of methods to add in a list that contains `all`:
+
+```
+## XXXTokenizer
+
+[[autodoc]] XXXTokenizer
+    - all
+    - __call__
+```

||||||
### Writing source documentation

Values that should be put in `code` should be surrounded by backticks: \`like so\`. Note that argument names
and objects like True, None or any strings should usually be put in `code`.

When mentioning a class, function or method, it is recommended to use our syntax for internal links so that our tool
adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`function\`\]. This requires the class or
function to be in the main package.

If you want to create a link to some internal class or function, you need to
provide its path. For instance: \[\`file_utils.ModelOutput\`\]. This will be converted into a link with
`file_utils.ModelOutput` in the description. To get rid of the path and only keep the name of the object you are
linking to in the description, add a ~: \[\`~file_utils.ModelOutput\`\] will generate a link with `ModelOutput` in the
description.

The same works for methods, so you can use either \[\`XXXClass.method\`\] or \[\`~XXXClass.method\`\].
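For example, a sentence of source documentation combining these syntaxes could read as follows (the class and method are just placeholders used for illustration):

```
Returns a [`~file_utils.ModelOutput`] instead of a plain tuple. See [`PreTrainedModel.forward`] for more details.
```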
#### Defining arguments in a method

Arguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and
an indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon and its
description:

```
Args:
    n_layers (`int`): The number of layers of the model.
```

If the description is too long to fit in one line, another indentation is necessary before writing the description
after the argument.

Here's an example showcasing everything so far:

```
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using [`AlbertTokenizer`]. See [`~PreTrainedTokenizer.encode`] and
        [`~PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
```

For optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the
@ -208,93 +216,183 @@ then its documentation should look like this:

```
Args:
    x (`str`, *optional*):
        This argument controls ...
    a (`float`, *optional*, defaults to 1):
        This argument is used to ...
```

Note that we always omit the "defaults to \`None\`" when None is the default for any argument. Also note that even
if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
however write as many lines as you want in the indented description (see the example above with `input_ids`).

#### Writing a multi-line code block

Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:

````
```
# first line of code
# second line
# etc
```
````

We follow the [doctest](https://docs.python.org/3/library/doctest.html) syntax for the examples to automatically test
that the results stay consistent with the library.

#### Writing a return block

The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.
The first line should be the type of the return, followed by a line return. No need to indent further for the elements
building the return.

Here's an example for a single value return:

```
Returns:
    `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
```

Here's an example for tuple return, comprising several objects:

```
Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
    - **loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
      Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
    - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
      Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
```

#### Adding an image

Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted `dataset`, like
the ones on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing), in which to place these files and reference
them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
If your contribution is external, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
to this dataset.

## Styling the docstring

We have an automatic script running with the `make style` command that will make sure that:

- the docstrings fully take advantage of the line width
- all code examples are formatted using black, like the code of the Transformers library
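You can run it from the root of the repository:

```bash
make style
```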

This script may have some weird failures if you made a syntax mistake or if you uncover a bug. Therefore, it's
recommended to commit your changes before running `make style`, so you can revert the changes done by that script
easily.

# Testing documentation examples

Good documentation often comes with an example of how a specific function or class should be used.
Each model class should contain at least one example showcasing
how to use this model class in inference. *E.g.* the class [Wav2Vec2ForCTC](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2ForCTC)
includes an example of how to transcribe speech to text in the
[docstring of its forward function](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2ForCTC.forward).

## Writing documentation examples

The syntax for Example docstrings can look as follows:

````
Example:

```python
>>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

>>> # audio file is decoded on the fly
>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits
>>> predicted_ids = torch.argmax(logits, dim=-1)

>>> # transcribe speech
>>> transcription = processor.batch_decode(predicted_ids)
>>> transcription[0]
'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'
```
````

The docstring should give a minimal, clear example of how the respective model
is to be used in inference and also include the expected (ideally sensible)
output.
Often, readers will try out the example before even going through the function
or class definitions. Therefore it is of utmost importance that the example
works as expected.

## Docstring testing

To do so each example should be included in the doctests.
We use pytest's [doctest integration](https://docs.pytest.org/doctest.html) to verify that all of our examples run correctly.
For Transformers, the doctests are run on a daily basis via GitHub Actions as can be
seen [here](https://github.com/huggingface/transformers/actions/workflows/doctests.yml).

To include your example in the daily doctests, you need to add the filename that
contains the example docstring to the [documentation_tests.txt](../utils/documentation_tests.txt).
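The entries in that file are simply one path per line, for instance (illustrative entries):

```
docs/source/quicktour.mdx
src/transformers/models/wav2vec2/modeling_wav2vec2.py
```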

### For Python files

You will first need to run the following command (from the root of the repository) to prepare the doc file (doc-testing needs to add additional lines that we don't include in the doc source files):

```bash
python utils/prepare_for_doc_test.py src docs
```

Then you can run all the tests in the docstrings of a given file with the following command, here is how we test the modeling file of Wav2Vec2 for instance:

```bash
pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py -sv --doctest-continue-on-failure
```

If you want to isolate a specific docstring, just add `::` after the file name then type the whole path of the function/class/method whose docstring you want to test. For instance, here is how to just test the forward method of `Wav2Vec2ForCTC`:

```bash
pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py::transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward -sv --doctest-continue-on-failure
```

Once you're done, you can run the following command (still from the root of the repository) to undo the changes made by the first command before committing:

```bash
python utils/prepare_for_doc_test.py src docs --remove_new_line
```

### For Markdown files

You will first need to run the following command (from the root of the repository) to prepare the doc file (doc-testing needs to add additional lines that we don't include in the doc source files):

```bash
python utils/prepare_for_doc_test.py src docs
```

Then you can test locally a given file with this command (here testing the quicktour):

```bash
pytest --doctest-modules docs/source/quicktour.mdx -sv --doctest-continue-on-failure --doctest-glob="*.mdx"
```

Once you're done, you can run the following command (still from the root of the repository) to undo the changes made by the first command before committing:

```bash
python utils/prepare_for_doc_test.py src docs --remove_new_line
```

### Writing doctests

Here are a few tips to help you debug the doctests and make them pass:

- The outputs of the code need to match the expected output **exactly**, so make sure you have the same outputs. In particular doctest will see a difference between single quotes and double quotes, or a missing parenthesis. The only exceptions to that rule are:
  * whitespace: one given whitespace (space, tabulation, new line) is equivalent to any number of whitespace, so you can add new lines where there are spaces to make your output more readable.
  * numerical values: you should never put more than 4 or 5 digits in expected results as different setups or library versions might get you slightly different results. `doctest` is configured to ignore any difference lower than the precision to which you wrote (so 1e-4 if you write 4 digits).
- Don't leave a block of code that takes very long to execute. If you can't make it fast, you can either not use the doctest syntax on it (so that it's ignored), or if you want to use the doctest syntax to show the results, you can add a comment `# doctest: +SKIP` at the end of the lines of code too long to execute.
- Each line of code that produces a result needs to have that result written below. You can ignore an output if you don't want to show it in your code example by adding a comment `# doctest: +IGNORE_RESULT` at the end of the line of code producing it.
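For instance, a doctest applying these directives could look like this minimal sketch (`very_slow_generation_call` is a hypothetical stand-in for any long-running call):

```python
>>> import torch

>>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
>>> x = torch.tensor([1.2345678])
>>> round(x.item(), 4)
1.2346
>>> very_slow_generation_call()  # doctest: +SKIP
```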
docs/source/_toctree.yml
@ -11,28 +11,54 @@
    title: Glossary
  title: Get started
- sections:
  - local: pipeline_tutorial
    title: Pipelines for inference
  - local: autoclass_tutorial
    title: Load pretrained instances with an AutoClass
  - local: preprocessing
    title: Preprocess
  - local: task_summary
    title: Summary of the tasks
  - local: model_summary
    title: Summary of the models
  - local: training
    title: Fine-tuning a pretrained model
  - local: accelerate
    title: Distributed training with 🤗 Accelerate
  - local: model_sharing
    title: Share a model
  - local: tokenizer_summary
    title: Summary of the tokenizers
  - local: multilingual
    title: Multi-lingual models
  title: Tutorials
- sections:
  - local: create_a_model
    title: Create a custom model
  - local: multilingual
    title: Inference for multilingual models
  - local: troubleshooting
    title: Troubleshooting
  - local: custom_datasets
    title: Fine-tuning with custom datasets
  - sections:
    - local: tasks/sequence_classification
      title: Text classification
    - local: tasks/token_classification
      title: Token classification
    - local: tasks/question_answering
      title: Question answering
    - local: tasks/language_modeling
      title: Language modeling
    - local: tasks/translation
      title: Translation
    - local: tasks/summarization
      title: Summarization
    - local: tasks/multiple_choice
      title: Multiple choice
    title: Fine-tune for downstream tasks
  - local: run_scripts
    title: Train with a script
  - local: notebooks
    title: "🤗 Transformers Notebooks"
  - local: sagemaker
@ -60,10 +86,12 @@
  - local: debugging
    title: Debugging
  - local: serialization
    title: Exporting 🤗 Transformers models
  - local: custom_models
    title: Sharing custom models
  - local: pr_checks
    title: Checks on a Pull Request
  title: How-to guides
- sections:
  - local: bertology
    title: BERTology
@ -86,6 +114,8 @@
    title: Logging
  - local: main_classes/model
    title: Models
  - local: main_classes/onnx
    title: ONNX
  - local: main_classes/optimizer_schedules
    title: Optimization
  - local: main_classes/output
@ -120,17 +150,17 @@
    title: BERT
  - local: model_doc/bertweet
    title: Bertweet
  - local: model_doc/bert-generation
    title: BertGeneration
  - local: model_doc/bert-japanese
    title: BertJapanese
  - local: model_doc/big_bird
    title: BigBird
  - local: model_doc/bigbird_pegasus
    title: BigBirdPegasus
  - local: model_doc/blenderbot
    title: Blenderbot
  - local: model_doc/blenderbot-small
    title: Blenderbot Small
  - local: model_doc/bort
    title: BORT
@ -140,6 +170,8 @@
    title: CamemBERT
  - local: model_doc/canine
    title: CANINE
  - local: model_doc/convnext
    title: ConvNeXT
  - local: model_doc/clip
    title: CLIP
  - local: model_doc/convbert
@ -148,9 +180,11 @@
    title: CPM
  - local: model_doc/ctrl
    title: CTRL
  - local: model_doc/data2vec
    title: Data2Vec
  - local: model_doc/deberta
    title: DeBERTa
  - local: model_doc/deberta-v2
    title: DeBERTa-v2
  - local: model_doc/deit
    title: DeiT
@ -164,7 +198,7 @@
    title: DPR
  - local: model_doc/electra
    title: ELECTRA
  - local: model_doc/encoder-decoder
    title: Encoder Decoder Models
  - local: model_doc/flaubert
    title: FlauBERT
@ -175,7 +209,7 @@
  - local: model_doc/funnel
    title: Funnel Transformer
  - local: model_doc/herbert
    title: HerBERT
  - local: model_doc/ibert
    title: I-BERT
  - local: model_doc/imagegpt
@ -196,14 +230,18 @@
    title: LXMERT
  - local: model_doc/marian
    title: MarianMT
  - local: model_doc/maskformer
    title: MaskFormer
  - local: model_doc/m2m_100
    title: M2M100
  - local: model_doc/mbart
    title: MBart and MBart-50
  - local: model_doc/megatron-bert
    title: MegatronBERT
  - local: model_doc/megatron_gpt2
    title: MegatronGPT2
  - local: model_doc/mluke
    title: MLUKE
  - local: model_doc/mobilebert
    title: MobileBERT
  - local: model_doc/mluke
@ -212,7 +250,9 @@
    title: MPNet
  - local: model_doc/mt5
    title: MT5
  - local: model_doc/nystromformer
    title: Nyströmformer
  - local: model_doc/openai-gpt
    title: OpenAI GPT
  - local: model_doc/gpt2
    title: OpenAI GPT2
@ -228,12 +268,18 @@
    title: Pegasus
  - local: model_doc/phobert
    title: PhoBERT
  - local: model_doc/plbart
    title: PLBart
  - local: model_doc/poolformer
    title: PoolFormer
  - local: model_doc/prophetnet
    title: ProphetNet
  - local: model_doc/qdqbert
    title: QDQBert
  - local: model_doc/rag
    title: RAG
  - local: model_doc/realm
    title: REALM
  - local: model_doc/reformer
    title: Reformer
  - local: model_doc/rembert
@ -248,9 +294,9 @@
    title: SegFormer
  - local: model_doc/sew
    title: SEW
  - local: model_doc/sew-d
    title: SEW-D
  - local: model_doc/speech-encoder-decoder
    title: Speech Encoder Decoder Models
  - local: model_doc/speech_to_text
    title: Speech2Text
@ -260,40 +306,58 @@
    title: Splinter
  - local: model_doc/squeezebert
    title: SqueezeBERT
  - local: model_doc/swin
    title: Swin Transformer
  - local: model_doc/t5
    title: T5
  - local: model_doc/t5v1.1
    title: T5v1.1
  - local: model_doc/tapas
    title: TAPAS
  - local: model_doc/transfo-xl
    title: Transformer XL
  - local: model_doc/trocr
    title: TrOCR
  - local: model_doc/unispeech
    title: UniSpeech
  - local: model_doc/unispeech-sat
    title: UniSpeech-SAT
  - local: model_doc/vilt
    title: ViLT
  - local: model_doc/vision-encoder-decoder
    title: Vision Encoder Decoder Models
  - local: model_doc/vision-text-dual-encoder
    title: Vision Text Dual Encoder
  - local: model_doc/vit
    title: Vision Transformer (ViT)
  - local: model_doc/vit_mae
    title: ViTMAE
  - local: model_doc/visual_bert
    title: VisualBERT
  - local: model_doc/wav2vec2
    title: Wav2Vec2
  - local: model_doc/wav2vec2_phoneme
    title: Wav2Vec2Phoneme
  - local: model_doc/wavlm
    title: WavLM
  - local: model_doc/xglm
    title: XGLM
  - local: model_doc/xlm
    title: XLM
  - local: model_doc/xlm-prophetnet
    title: XLM-ProphetNet
  - local: model_doc/xlm-roberta
    title: XLM-RoBERTa
  - local: model_doc/xlm-roberta-xl
    title: XLM-RoBERTa-XL
  - local: model_doc/xlnet
    title: XLNet
  - local: model_doc/xlsr_wav2vec2
    title: XLSR-Wav2Vec2
  - local: model_doc/xls_r
    title: XLS-R
  - local: model_doc/yoso
    title: YOSO
  title: Models
- sections:
  - local: internal/modeling_utils
docs/source/accelerate.mdx (new file, 132 lines)
@ -0,0 +1,132 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Distributed training with 🤗 Accelerate

As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate/index.html) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPUs on one machine or multiple GPUs across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment.

## Setup

Get started by installing 🤗 Accelerate:

```bash
pip install accelerate
```

Then import and create an [`Accelerator`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator) object. `Accelerator` will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device.

```py
>>> from accelerate import Accelerator

>>> accelerator = Accelerator()
```

## Prepare to accelerate

The next step is to pass all the relevant training objects to the [`prepare`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.prepare) method. This includes your training and evaluation DataLoaders, a model and an optimizer:

```py
>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
...     train_dataloader, eval_dataloader, model, optimizer
... )
```

## Backward

The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`backward`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.backward) method:

```py
>>> for epoch in range(num_epochs):
...     for batch in train_dataloader:
...         outputs = model(**batch)
...         loss = outputs.loss
...         accelerator.backward(loss)

...         optimizer.step()
...         lr_scheduler.step()
...         optimizer.zero_grad()
...         progress_bar.update(1)
```

As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training!

```diff
+ from accelerate import Accelerator
  from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler

+ accelerator = Accelerator()

  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
  optimizer = AdamW(model.parameters(), lr=3e-5)

- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
- model.to(device)

+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+     train_dataloader, eval_dataloader, model, optimizer
+ )

  num_epochs = 3
  num_training_steps = num_epochs * len(train_dataloader)
  lr_scheduler = get_scheduler(
      "linear",
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps
  )

  progress_bar = tqdm(range(num_training_steps))

  model.train()
  for epoch in range(num_epochs):
      for batch in train_dataloader:
-         batch = {k: v.to(device) for k, v in batch.items()}
          outputs = model(**batch)
          loss = outputs.loss
-         loss.backward()
+         accelerator.backward(loss)

          optimizer.step()
          lr_scheduler.step()
          optimizer.zero_grad()
          progress_bar.update(1)
```

## Train

Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory.

### Train with a script

If you are running your training from a script, run the following command to create and save a configuration file:

```bash
accelerate config
```

Then launch your training with:

```bash
accelerate launch train.py
```

### Train with a notebook

🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to `notebook_launcher`:

```py
>>> from accelerate import notebook_launcher

>>> notebook_launcher(training_function)
```

For more information about 🤗 Accelerate and its rich features, refer to the [documentation](https://huggingface.co/docs/accelerate/index.html).
docs/source/add_new_pipeline.mdx (new file, 140 lines)
@ -0,0 +1,140 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# How to add a pipeline to 🤗 Transformers?

First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible
as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the
pipeline (`preprocess`).

Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of the
`postprocess` method.

Start by inheriting the base class `Pipeline` and implementing the 4 needed methods: `preprocess`,
`_forward`, `postprocess` and `_sanitize_parameters`.

```python
from transformers import Pipeline


class MyPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, maybe_arg=2):
        model_input = Tensor(inputs["input_ids"])
        return {"model_input": model_input}

    def _forward(self, model_inputs):
        # model_inputs == {"model_input": model_input}
        outputs = self.model(**model_inputs)
        # Maybe {"logits": Tensor(...)}
        return outputs

    def postprocess(self, model_outputs):
        best_class = model_outputs["logits"].softmax(-1)
        return best_class
```

This breakdown is structured to support CPU/GPU relatively seamlessly, while allowing pre/postprocessing to be done
on the CPU on different threads.

`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might
contain more information and is usually a `Dict`.

`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred
called method as it contains safeguards to make sure everything is working on the expected device. If anything is
linked to a real model it belongs in the `_forward` method, anything else is in the preprocess/postprocess.

`postprocess` methods will take the output of `_forward` and turn it into the final output that was decided
earlier.

`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization
time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`.

The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`,
`_forward` and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That
allows to keep the default arguments in the function definition which is always more "natural".

A classic example would be a `top_k` argument in the post processing in classification tasks.

```python
>>> pipe = pipeline("my-new-task")
>>> pipe("This is a test")
[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05},
{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]

>>> pipe("This is a test", top_k=2)
[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
```

In order to achieve that, we'll update our `postprocess` method with a default parameter set to `5`, and edit
`_sanitize_parameters` to allow this new parameter.

```python
def postprocess(self, model_outputs, top_k=5):
    best_class = model_outputs["logits"].softmax(-1)
    # Add logic to handle top_k
    return best_class


def _sanitize_parameters(self, **kwargs):
    preprocess_kwargs = {}
    if "maybe_arg" in kwargs:
        preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]

    postprocess_kwargs = {}
    if "top_k" in kwargs:
        postprocess_kwargs["top_k"] = kwargs["top_k"]
    return preprocess_kwargs, {}, postprocess_kwargs
```

Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy
without requiring users to understand new kinds of objects. It's also relatively common to support many different types
of arguments for ease of use (audio files, which can be filenames, URLs or pure bytes).

## Adding it to the list of supported tasks

Go to `src/transformers/pipelines/__init__.py` and fill in `SUPPORTED_TASKS` with your newly created pipeline.
If possible it should provide a default model.
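The exact schema of `SUPPORTED_TASKS` may change between versions, so treat the following as a rough sketch rather than the definitive format; `MyPipeline`, the model class and the default checkpoint name are illustrative placeholders:

```python
# Sketch of a SUPPORTED_TASKS entry; check the existing entries in
# src/transformers/pipelines/__init__.py for the real schema.
SUPPORTED_TASKS = {
    # ... existing tasks ...
    "my-new-task": {
        "impl": MyPipeline,  # the pipeline class implemented above
        "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (),
        "tf": (),  # TensorFlow model classes, if the task supports them
        "default": {"model": {"pt": "user/my-default-model"}},  # hypothetical default checkpoint
    },
}
```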

## Adding tests

Create a new file `tests/test_pipelines_MY_PIPELINE.py` with tests modeled on the other pipeline tests.

The `run_pipeline_test` function will be very generic and run on small random models on every possible
architecture as defined by `model_mapping` and `tf_model_mapping`.

This is very important to test future compatibility, meaning if someone adds a new model for
`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's
impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the
type of the pipeline output.

You also *need* to implement 2 (ideally 4) tests, as sketched after this list:

- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
  and test the pipeline outputs. The results should be the same as `test_small_model_tf`.
- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
  and test the pipeline outputs. The results should be the same as `test_small_model_pt`.
- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to
  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
  sure there is no drift in future releases.
- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to
  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
  sure there is no drift in future releases.
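As a minimal sketch (the tiny checkpoint name and the assertion are illustrative placeholders, not the required structure), a small PyTorch test could look like:

```python
import unittest

from transformers import pipeline
from transformers.testing_utils import require_torch


class MyPipelineTests(unittest.TestCase):
    @require_torch
    def test_small_model_pt(self):
        # Hypothetical tiny checkpoint; any small random model for the task works
        pipe = pipeline("my-new-task", model="hf-internal-testing/tiny-random-my-model")
        outputs = pipe("This is a test")
        # With random weights the values are meaningless, so only check the output structure
        self.assertIsInstance(outputs, list)
```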
docs/source/add_new_pipeline.rst (deleted file, 143 lines)
@ -1,143 +0,0 @@
(The removed file was the reStructuredText version of "How to add a pipeline to 🤗 Transformers?", superseded by docs/source/add_new_pipeline.mdx above.)
docs/source/autoclass_tutorial.mdx (new file, 104 lines)
@ -0,0 +1,104 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Load pretrained instances with an AutoClass

With so many different Transformer architectures, it can be challenging to create one for your checkpoint. As part of the 🤗 Transformers core philosophy to make the library easy, simple and flexible to use, an `AutoClass` automatically infers and loads the correct architecture from a given checkpoint. The `from_pretrained` method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different.

<Tip>

Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/bert-base-uncased) is an architecture, while `bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint.

</Tip>

In this tutorial, learn to:

* Load a pretrained tokenizer.
* Load a pretrained feature extractor.
* Load a pretrained processor.
* Load a pretrained model.

## AutoTokenizer

Nearly every NLP task begins with a tokenizer. A tokenizer converts your input into a format that can be processed by the model.

Load a tokenizer with [`AutoTokenizer.from_pretrained`]:

```py
>>> from transformers import AutoTokenizer

>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
```

Then tokenize your input as shown below:

```py
>>> sequence = "In a hole in the ground there lived a hobbit."
>>> print(tokenizer(sequence))
{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
```

## AutoFeatureExtractor

For audio and vision tasks, a feature extractor processes the audio signal or image into the correct input format.

Load a feature extractor with [`AutoFeatureExtractor.from_pretrained`]:

```py
>>> from transformers import AutoFeatureExtractor

>>> feature_extractor = AutoFeatureExtractor.from_pretrained(
...     "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
... )
```

## AutoProcessor

Multimodal tasks require a processor that combines two types of preprocessing tools. For example, the [LayoutLMV2](model_doc/layoutlmv2) model requires a feature extractor to handle images and a tokenizer to handle text; a processor combines both of them.

Load a processor with [`AutoProcessor.from_pretrained`]:

```py
>>> from transformers import AutoProcessor

>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
```

## AutoModel

Finally, the `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]:

```py
>>> from transformers import AutoModelForSequenceClassification

>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
===PT-TF-SPLIT===
>>> from transformers import TFAutoModelForSequenceClassification

>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
```

Easily reuse the same checkpoint to load an architecture for a different task:

```py
>>> from transformers import AutoModelForTokenClassification

>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
===PT-TF-SPLIT===
>>> from transformers import TFAutoModelForTokenClassification

>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
```
Generally, we recommend using the `AutoTokenizer` class and the `AutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, feature extractor and processor to preprocess a dataset for fine-tuning.
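To tie the pieces together, here is a minimal sketch of the loaded tokenizer feeding the loaded model; the `[1, 2]` logits shape assumes the default two-label classification head:

```py
>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

>>> inputs = tokenizer("In a hole in the ground there lived a hobbit.", return_tensors="pt")
>>> outputs = model(**inputs)
>>> outputs.logits.shape
torch.Size([1, 2])
```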
@@ -12,15 +12,22 @@ specific language governing permissions and limitations under the License.
 # Benchmarks

+<Tip warning={true}>
+
+Hugging Face's Benchmarking tools are deprecated and it is advised to use external Benchmarking libraries to measure the speed
+and memory complexity of Transformer models.
+
+</Tip>
+
 [[open-in-colab]]

-Let's take a look at how 🤗 Transformer models can be benchmarked, best practices, and already available benchmarks.
+Let's take a look at how 🤗 Transformers models can be benchmarked, best practices, and already available benchmarks.

-A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found [here](https://github.com/huggingface/transformers/tree/master/notebooks/05-benchmark.ipynb).
+A notebook explaining in more detail how to benchmark 🤗 Transformers models can be found [here](https://github.com/huggingface/notebooks/tree/master/examples/benchmark.ipynb).

-## How to benchmark 🤗 Transformer models
+## How to benchmark 🤗 Transformers models

-The classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] allow to flexibly benchmark 🤗 Transformer models. The benchmark classes allow us to measure the _peak memory usage_ and _required time_ for both _inference_ and _training_.
+The classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] allow to flexibly benchmark 🤗 Transformers models. The benchmark classes allow us to measure the _peak memory usage_ and _required time_ for both _inference_ and _training_.

 <Tip>
@@ -37,11 +44,12 @@ The benchmark classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] expect an

 >>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
 >>> benchmark = PyTorchBenchmark(args)

 ===PT-TF-SPLIT===
 >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments

->>> args = TensorFlowBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> args = TensorFlowBenchmarkArguments(
+...     models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
 >>> benchmark = TensorFlowBenchmark(args)
 ```
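Once instantiated, running the benchmark is a single call; a minimal sketch (the exact fields of the returned results vary across versions):

```py
>>> results = benchmark.run()
>>> print(results)
```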
@@ -174,7 +182,9 @@ configurations must be inserted with the benchmark args as follows.

 ```py
 >>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig

->>> args = PyTorchBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> args = PyTorchBenchmarkArguments(
+...     models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
 >>> config_base = BertConfig()
 >>> config_384_hid = BertConfig(hidden_size=384)
 >>> config_6_lay = BertConfig(num_hidden_layers=6)
@@ -244,7 +254,9 @@ bert-6-lay 8 512 1359

 ===PT-TF-SPLIT===
 >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig

->>> args = TensorFlowBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> args = TensorFlowBenchmarkArguments(
+...     models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
 >>> config_base = BertConfig()
 >>> config_384_hid = BertConfig(hidden_size=384)
 >>> config_6_lay = BertConfig(num_hidden_layers=6)
docs/source/bertology.mdx (new file, 36 lines)
@@ -0,0 +1,36 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# BERTology

There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT
(that some call "BERTology"). Some good examples of this field are:

- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick:
  https://arxiv.org/abs/1905.05950
- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
  Manning: https://arxiv.org/abs/1906.04341

In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to
help people access the inner representations, mainly adapted from the great work of Paul Michel
(https://arxiv.org/abs/1905.10650):

- accessing all the hidden states of BERT/GPT/GPT-2,
- accessing all the attention weights for each head of BERT/GPT/GPT-2,
- retrieving heads output values and gradients to be able to compute head importance scores and prune heads as explained
  in https://arxiv.org/abs/1905.10650.

To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/master/examples/research_projects/bertology/run_bertology.py) which extracts information from and prunes a model pre-trained on
GLUE.
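To make the feature list above concrete, here is a minimal sketch of accessing hidden states and attentions and pruning heads; the lengths shown assume `bert-base-uncased` (12 layers):

```py
>>> import torch
>>> from transformers import BertModel, BertTokenizer

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True, output_attentions=True)

>>> inputs = tokenizer("Hello, BERTology!", return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model(**inputs)
>>> len(outputs.hidden_states), len(outputs.attentions)  # embeddings + 12 layers, one attention map per layer
(13, 12)

>>> # remove heads 0 and 2 of the first layer in place
>>> model.prune_heads({0: [0, 2]})
```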
docs/source/bertology.rst (deleted, 38 lines)
@@ -1,38 +0,0 @@
(RST source of the BERTology page above, removed in favor of the new docs/source/bertology.mdx; the content is identical apart from markup.)
@@ -62,3 +62,4 @@ This page regroups resources around 🤗 Transformers developed by the community
 | [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) |
 | [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | How to use a trained *DetrForObjectDetection* model to detect objects in an image and visualize attention | [Niels Rogge](https://github.com/NielsRogge) | [](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) |
 | [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | How to fine-tune *DetrForObjectDetection* on a custom object detection dataset | [Niels Rogge](https://github.com/NielsRogge) | [](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) |
+| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | How to fine-tune *T5* on a Named Entity Recognition Task | [Ogundepo Odunayo](https://github.com/ToluClassics) | [](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) |
docs/source/converting_tensorflow_models.mdx (new file, 162 lines)
@@ -0,0 +1,162 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Converting Tensorflow Checkpoints

A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models
that can be loaded using the `from_pretrained` methods of the library.

<Tip>

Since 2.3.0 the conversion script is part of the transformers CLI (**transformers-cli**), available in any
transformers >= 2.3.0 installation.

The documentation below reflects the **transformers-cli convert** command format.

</Tip>
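If you are unsure which flags a given model type expects, the CLI can print them; a quick sketch (the exact output depends on your installed version):

```bash
transformers-cli convert --help
```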

## BERT

You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the
[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/master/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py) script.

This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated
configuration file (`bert_config.json`), creates a PyTorch model for this configuration, loads the weights from
the TensorFlow checkpoint into the PyTorch model and saves the resulting model in a standard PyTorch save file that can
be imported using `from_pretrained()` (see example in [quicktour](quicktour), [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification/run_glue.py)).

You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file
(`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too.

To run this specific conversion script you will need to have TensorFlow and PyTorch installed (`pip install tensorflow`). The rest of the repository only requires PyTorch.

Here is an example of the conversion process for a pre-trained `BERT-Base Uncased` model:

```bash
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

transformers-cli convert --model_type bert \
  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
  --config $BERT_BASE_DIR/bert_config.json \
  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
```

You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models).

## ALBERT

Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/master/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py) script.

The CLI takes as input a TensorFlow checkpoint (three files starting with `model.ckpt-best`) and the accompanying
configuration file (`albert_config.json`), then creates and saves a PyTorch model. To run this conversion you will
need to have TensorFlow and PyTorch installed.

Here is an example of the conversion process for the pre-trained `ALBERT Base` model:

```bash
export ALBERT_BASE_DIR=/path/to/albert/albert_base

transformers-cli convert --model_type albert \
  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
  --config $ALBERT_BASE_DIR/albert_config.json \
  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
```

You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/albert#pre-trained-models).

## OpenAI GPT

Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint
is saved in the same format as the OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm)):

```bash
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights

transformers-cli convert --model_type gpt \
  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT_CONFIG] \
  [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK]
```

## OpenAI GPT-2

Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see [here](https://github.com/openai/gpt-2)):

```bash
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights

transformers-cli convert --model_type gpt2 \
  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT2_CONFIG] \
  [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
```

## Transformer-XL

Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)):

```bash
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint

transformers-cli convert --model_type transfo_xl \
  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config TRANSFO_XL_CONFIG] \
  [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
```

## XLNet

Here is an example of the conversion process for a pre-trained XLNet model:

```bash
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config

transformers-cli convert --model_type xlnet \
  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
  --config $TRANSFO_XL_CONFIG_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--finetuning_task_name XLNET_FINETUNED_TASK]
```

## XLM

Here is an example of the conversion process for a pre-trained XLM model:

```bash
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint

transformers-cli convert --model_type xlm \
  --tf_checkpoint $XLM_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config XLM_CONFIG] \
  [--finetuning_task_name XLM_FINETUNED_TASK]
```

## T5

Here is an example of the conversion process for a pre-trained T5 model:

```bash
export T5=/path/to/t5/uncased_L-12_H-768_A-12

transformers-cli convert --model_type t5 \
  --tf_checkpoint $T5/t5_model.ckpt \
  --config $T5/t5_config.json \
  --pytorch_dump_output $T5/pytorch_model.bin
```
docs/source/converting_tensorflow_models.rst (deleted, 181 lines)
@@ -1,181 +0,0 @@
(RST source of the "Converting Tensorflow Checkpoints" page above, removed in favor of the new docs/source/converting_tensorflow_models.mdx; the content is identical apart from markup.)
docs/source/create_a_model.mdx (new file, 323 lines)
@@ -0,0 +1,323 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Create a custom model

An [`AutoClass`](model_doc/auto) automatically infers the model architecture and downloads pretrained configuration and weights. Generally, we recommend using an `AutoClass` to produce checkpoint-agnostic code. But users who want more control over specific model parameters can create a custom 🤗 Transformers model from just a few base classes. This could be particularly useful for anyone who is interested in studying, training or experimenting with a 🤗 Transformers model. In this guide, dive deeper into creating a custom model without an `AutoClass`. Learn how to:

- Load and customize a model configuration.
- Create a model architecture.
- Create a slow and fast tokenizer for text.
- Create a feature extractor for audio or image tasks.
- Create a processor for multimodal tasks.

## Configuration

A [configuration](main_classes/configuration) refers to a model's specific attributes. Each model configuration has different attributes; for instance, all NLP models have the `hidden_size`, `num_attention_heads`, `num_hidden_layers` and `vocab_size` attributes in common. These attributes specify the number of attention heads or hidden layers to construct a model with.

Get a closer look at [DistilBERT](model_doc/distilbert) by accessing [`DistilBertConfig`] to inspect its attributes:

```py
>>> from transformers import DistilBertConfig

>>> config = DistilBertConfig()
>>> print(config)
DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "transformers_version": "4.16.2",
  "vocab_size": 30522
}
```

[`DistilBertConfig`] displays all the default attributes used to build a base [`DistilBertModel`]. All attributes are customizable, creating space for experimentation. For example, you can customize a default model to:

- Try a different activation function with the `activation` parameter.
- Use a higher dropout ratio for the attention probabilities with the `attention_dropout` parameter.

```py
>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4)
>>> print(my_config)
DistilBertConfig {
  "activation": "relu",
  "attention_dropout": 0.4,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "transformers_version": "4.16.2",
  "vocab_size": 30522
}
```

Pretrained model attributes can be modified in the [`~PretrainedConfig.from_pretrained`] function:

```py
>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4)
```

Once you are satisfied with your model configuration, you can save it with [`~PretrainedConfig.save_pretrained`]. Your configuration file is stored as a JSON file in the specified save directory:

```py
>>> my_config.save_pretrained(save_directory="./your_model_save_path")
```

To reuse the configuration file, load it with [`~PretrainedConfig.from_pretrained`]:

```py
>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json")
```

<Tip>

You can also save your configuration file as a dictionary or even just the difference between your custom configuration attributes and the default configuration attributes! See the [configuration](main_classes/configuration) documentation for more details.

</Tip>

## Model

The next step is to create a [model](main_classes/models). The model - also loosely referred to as the architecture - defines what each layer is doing and what operations are happening. Attributes like `num_hidden_layers` from the configuration are used to define the architecture. Every model shares the base class [`PreTrainedModel`] and a few common methods like resizing input embeddings and pruning self-attention heads. In addition, all models are also either a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) or [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass, so they can be used like any other module within their respective framework.

Load your custom configuration attributes into the model:

```py
>>> from transformers import DistilBertModel

>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json")
>>> model = DistilBertModel(my_config)
===PT-TF-SPLIT===
>>> from transformers import TFDistilBertModel

>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json")
>>> tf_model = TFDistilBertModel(my_config)
```

This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training.

Create a pretrained model with [`~PreTrainedModel.from_pretrained`]:

```py
>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
===PT-TF-SPLIT===
>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
```

When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like:

```py
>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
===PT-TF-SPLIT===
>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
```

### Model heads

At this point, you have a base DistilBERT model which outputs the *hidden states*. The hidden states are passed as inputs to a model head to produce the final output. 🤗 Transformers provides a different model head for each task as long as a model supports the task (i.e., you can't use DistilBERT for a sequence-to-sequence task like translation).

For example, [`DistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs.

```py
>>> from transformers import DistilBertForSequenceClassification

>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
===PT-TF-SPLIT===
>>> from transformers import TFDistilBertForSequenceClassification

>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
```

Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`DistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output.

```py
>>> from transformers import DistilBertForQuestionAnswering

>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
===PT-TF-SPLIT===
>>> from transformers import TFDistilBertForQuestionAnswering

>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
```

## Tokenizer

The last base class you need before using a model for textual data is a [tokenizer](main_classes/tokenizer) to convert raw text to tensors. There are two types of tokenizers you can use with 🤗 Transformers:

- [`PreTrainedTokenizer`]: a Python implementation of a tokenizer.
- [`PreTrainedTokenizerFast`]: a tokenizer from our Rust-based [🤗 Tokenizers](https://huggingface.co/docs/tokenizers/python/latest/) library. This tokenizer type is significantly faster - especially during batch tokenization - due to its Rust implementation. The fast tokenizer also offers additional methods like *offset mapping* which maps tokens to their original words or characters.

Both tokenizers support common methods such as encoding and decoding, adding new tokens, and managing special tokens (a quick encode/decode sketch follows the tokenizer examples below).

<Tip warning={true}>

Not every model supports a fast tokenizer. Take a look at this [table](index#supported-frameworks) to check if a model has fast tokenizer support.

</Tip>

If you trained your own tokenizer, you can create one from your *vocabulary* file:

```py
>>> from transformers import DistilBertTokenizer

>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left")
```

It is important to remember the vocabulary from a custom tokenizer will be different from the vocabulary generated by a pretrained model's tokenizer. You need to use a pretrained model's vocabulary if you are using a pretrained model, otherwise the inputs won't make sense. Create a tokenizer with a pretrained model's vocabulary with the [`DistilBertTokenizer`] class:

```py
>>> from transformers import DistilBertTokenizer

>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
```

Create a fast tokenizer with the [`DistilBertTokenizerFast`] class:

```py
>>> from transformers import DistilBertTokenizerFast

>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
```
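As mentioned above, both tokenizers share the same encode/decode interface; a quick sketch (the decoded string shown is indicative for `distilbert-base-uncased`):

```py
>>> encoding = fast_tokenizer("Do not meddle in the affairs of wizards")
>>> fast_tokenizer.decode(encoding["input_ids"])
'[CLS] do not meddle in the affairs of wizards [SEP]'
```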

<Tip>

By default, [`AutoTokenizer`] will try to load a fast tokenizer. You can disable this behavior by setting `use_fast=False` in `from_pretrained`.

</Tip>
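For instance, a minimal sketch of forcing the slow tokenizer:

```py
>>> from transformers import AutoTokenizer

>>> slow_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=False)
```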

## Feature Extractor

A feature extractor processes audio or image inputs. It inherits from the base [`~feature_extraction_utils.FeatureExtractionMixin`] class, and may also inherit from the [`ImageFeatureExtractionMixin`] class for processing image features or the [`SequenceFeatureExtractor`] class for processing audio inputs.

Depending on whether you are working on an audio or vision task, create a feature extractor associated with the model you're using. For example, create a default [`ViTFeatureExtractor`] if you are using [ViT](model_doc/vit) for image classification:

```py
>>> from transformers import ViTFeatureExtractor

>>> vit_extractor = ViTFeatureExtractor()
>>> print(vit_extractor)
ViTFeatureExtractor {
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "ViTFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "size": 224
}
```

<Tip>

If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default feature extractor parameters.

</Tip>

Modify any of the [`ViTFeatureExtractor`] parameters to create your custom feature extractor:

```py
>>> from transformers import ViTFeatureExtractor

>>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3])
>>> print(my_vit_extractor)
ViTFeatureExtractor {
  "do_normalize": false,
  "do_resize": true,
  "feature_extractor_type": "ViTFeatureExtractor",
  "image_mean": [
    0.3,
    0.3,
    0.3
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": "PIL.Image.BOX",
  "size": 224
}
```

For audio inputs, you can create a [`Wav2Vec2FeatureExtractor`] and customize the parameters in a similar way:

```py
>>> from transformers import Wav2Vec2FeatureExtractor

>>> w2v2_extractor = Wav2Vec2FeatureExtractor()
>>> print(w2v2_extractor)
Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
```
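As with the ViT example, any of these defaults can be overridden at construction time; a brief sketch:

```py
>>> my_w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False, return_attention_mask=True)
```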

## Processor

For models that support multimodal tasks, 🤗 Transformers offers a processor class that conveniently wraps a feature extractor and tokenizer into a single object. For example, let's use the [`Wav2Vec2Processor`] for an automatic speech recognition (ASR) task. ASR transcribes audio to text, so you will need a feature extractor and a tokenizer.

Create a feature extractor to handle the audio inputs:

```py
>>> from transformers import Wav2Vec2FeatureExtractor

>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True)
```

Create a tokenizer to handle the text inputs:

```py
>>> from transformers import Wav2Vec2CTCTokenizer

>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt")
```

Combine the feature extractor and tokenizer in [`Wav2Vec2Processor`]:

```py
>>> from transformers import Wav2Vec2Processor

>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
```

With two basic classes - configuration and model - and an additional preprocessing class (tokenizer, feature extractor, or processor), you can create any of the models supported by 🤗 Transformers. Each of these base classes is configurable, allowing you to use the specific attributes you want. You can easily set up a model for training or modify an existing pretrained model to fine-tune.
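Putting it all together, a minimal sketch of assembling such a custom setup from the base classes covered in this guide (the configuration values are arbitrary examples):

```py
>>> from transformers import DistilBertConfig, DistilBertModel, DistilBertTokenizerFast

>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4)
>>> model = DistilBertModel(my_config)  # randomly initialized - train before relying on its outputs
>>> tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

>>> inputs = tokenizer("Architecture plus weights equals model.", return_tensors="pt")
>>> outputs = model(**inputs)
```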
@@ -54,6 +54,7 @@ The 🤗 Datasets library makes it simple to load a dataset:

 ```python
 from datasets import load_dataset
+
 imdb = load_dataset("imdb")
 ```
@@ -61,8 +62,9 @@ This loads a `DatasetDict` object which you can index into to view an example:

 ```python
 imdb["train"][0]
-{'label': 1,
- 'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'
+{
+    "label": 1,
+    "text": "Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as \"Teachers\". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is \"Teachers\". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!",
 }
 ```
@@ -74,6 +76,7 @@ model was trained with to ensure appropriately tokenized words. Load the DistilB

 ```python
 from transformers import AutoTokenizer
+
 tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
 ```
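Not shown in these hunks is the tokenization step that produces the `tokenized_imdb` dataset used below; a minimal sketch, assuming the tokenizer loaded above:

```python
def preprocess_function(examples):
    # truncate reviews longer than the model's maximum input length
    return tokenizer(examples["text"], truncation=True)


tokenized_imdb = imdb.map(preprocess_function, batched=True)
```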
@@ -99,6 +102,7 @@ batch. This is known as **dynamic padding**. You can do this with the `DataColla

 ```python
 from transformers import DataCollatorWithPadding
+
 data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 ```
@@ -108,6 +112,7 @@ Now load your model with the [`AutoModelForSequenceClassification`] class along

 ```python
 from transformers import AutoModelForSequenceClassification
+
 model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
 ```
@@ -121,7 +126,7 @@ At this point, only three steps remain:
 from transformers import TrainingArguments, Trainer

 training_args = TrainingArguments(
-    output_dir='./results',
+    output_dir="./results",
     learning_rate=2e-5,
     per_device_train_batch_size=16,
     per_device_eval_batch_size=16,
@@ -150,6 +155,7 @@ Make sure you set `return_tensors="tf"` to return `tf.Tensor` outputs instead of

 ```python
 from transformers import DataCollatorWithPadding
+
 data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
 ```
@@ -158,14 +164,14 @@ Next, convert your datasets to the `tf.data.Dataset` format with `to_tf_dataset`

 ```python
 tf_train_dataset = tokenized_imdb["train"].to_tf_dataset(
-    columns=['attention_mask', 'input_ids', 'label'],
+    columns=["attention_mask", "input_ids", "label"],
     shuffle=True,
     batch_size=16,
     collate_fn=data_collator,
 )

 tf_validation_dataset = tokenized_imdb["train"].to_tf_dataset(
-    columns=['attention_mask', 'input_ids', 'label'],
+    columns=["attention_mask", "input_ids", "label"],
     shuffle=False,
     batch_size=16,
     collate_fn=data_collator,

@@ -182,17 +188,14 @@ batch_size = 16

```python
num_epochs = 5
batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
```

Load your model with the [`TFAutoModelForSequenceClassification`] class along with the number of expected labels:

```python
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
```

@@ -200,6 +203,7 @@ Compile the model:

```python
import tensorflow as tf

model.compile(optimizer=optimizer)
```
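
With the model compiled, fine-tuning is a standard Keras call; a minimal sketch using the datasets and optimizer built above:

```python
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=num_epochs)
```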

@@ -234,14 +238,15 @@ or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/no

Load the WNUT 17 dataset from the 🤗 Datasets library:

```python
>>> from datasets import load_dataset

>>> wnut = load_dataset("wnut_17")
```

A quick look at the dataset shows the labels associated with each word in the sentence:

```python
>>> wnut["train"][0]
{'id': '0',
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0],
 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
}
```

@@ -251,21 +256,22 @@ wnut["train"][0]

View the specific NER tags by:

```python
>>> label_list = wnut["train"].features["ner_tags"].feature.names
>>> label_list
[
    "O",
    "B-corporation",
    "I-corporation",
    "B-creative-work",
    "I-creative-work",
    "B-group",
    "I-group",
    "B-location",
    "I-location",
    "B-person",
    "I-person",
    "B-product",
    "I-product",
]
```
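
These tag names are often turned into integer mappings for later use; a small sketch (the variable names here are illustrative, not from this diff):

```python
id2label = {i: tag for i, tag in enumerate(label_list)}
label2id = {tag: i for i, tag in enumerate(label_list)}
```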

@@ -282,6 +288,7 @@ Now you need to tokenize the text. Load the DistilBERT tokenizer with an [`AutoT

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
```

@@ -289,9 +296,9 @@ Since the input has already been split into words, set `is_split_into_words=True

subwords:

```python
>>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
>>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
>>> tokens
['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]']
```

@@ -314,12 +321,14 @@ def tokenize_and_align_labels(examples):

```python
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
```
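
For orientation, a self-contained sketch of the full alignment function this hunk excerpts (the tokenizer call and the final `map` are assumptions based on the surrounding guide):

```python
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:  # Special tokens get -100 so the loss ignores them.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
```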

@@ -336,6 +345,7 @@ Finally, pad your text and labels, so they are a uniform length:

```python
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)
```

@@ -345,6 +355,7 @@ Load your model with the [`AutoModelForTokenClassification`] class along with th

```python
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_list))
```

@@ -352,7 +363,7 @@ Gather your training arguments in [`TrainingArguments`]:

```python
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
```

@@ -387,6 +398,7 @@ Batch your examples together and pad your text and labels, so they are a uniform

```python
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")
```

@@ -412,6 +424,7 @@ Load the model with the [`TFAutoModelForTokenClassification`] class along with t

```python
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_list))
```

@@ -435,6 +448,7 @@ Compile the model:

```python
import tensorflow as tf

model.compile(optimizer=optimizer)
```

@@ -469,13 +483,14 @@ Load the SQuAD dataset from the 🤗 Datasets library:

```python
from datasets import load_dataset

squad = load_dataset("squad")
```

Take a look at an example from the dataset:

```python
>>> squad["train"][0]
{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'id': '5733be284776f41900661182',
```

@@ -490,6 +505,7 @@ Load the DistilBERT tokenizer with an [`AutoTokenizer`]:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
```
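
The diff jumps from the tokenizer straight to batching, eliding the preprocessing that produces `tokenized_squad`. A sketch of such a function, which maps each answer's character span onto token positions via the offset mapping (the `max_length=384` and `padding="max_length"` settings are assumptions):

```python
def preprocess_function(examples):
    inputs = tokenizer(
        [q.strip() for q in examples["question"]],
        examples["context"],
        max_length=384,
        truncation="only_second",  # only truncate the context, never the question
        return_offsets_mapping=True,
        padding="max_length",
    )
    offset_mapping = inputs.pop("offset_mapping")
    start_positions, end_positions = [], []

    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find where the context starts and ends in the tokenized sequence.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # Label (0, 0) if the answer was truncated out of the context.
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
```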

@@ -567,6 +583,7 @@ Batch the processed examples together:

```python
from transformers import default_data_collator

data_collator = default_data_collator
```

@@ -576,6 +593,7 @@ Load your model with the [`AutoModelForQuestionAnswering`] class:

```python
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
```

@@ -583,7 +601,7 @@ Gather your training arguments in [`TrainingArguments`]:

```python
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
```

@@ -618,6 +636,7 @@ Batch the processed examples together with a TensorFlow default data collator:

```python
from transformers.data.data_collator import tf_default_data_collator

data_collator = tf_default_data_collator
```

@@ -650,8 +669,8 @@ batch_size = 16

```python
num_epochs = 2
total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)
```

@@ -660,6 +679,7 @@ Load your model with the [`TFAutoModelForQuestionAnswering`] class:

```python
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
```

@@ -667,6 +687,7 @@ Compile the model:

```python
import tensorflow as tf

model.compile(optimizer=optimizer)
```

docs/source/custom_models.mdx (new file, 349 lines)
@@ -0,0 +1,349 @@

<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Sharing custom models

The 🤗 Transformers library is designed to be easily extensible. Every model is fully coded in a given subfolder
of the repository with no abstraction, so you can easily copy a modeling file and tweak it to your needs.

If you are writing a brand new model, it might be easier to start from scratch. In this tutorial, we will show you
how to write a custom model and its configuration so it can be used inside Transformers, and how you can share it
with the community (with the code it relies on) so that anyone can use it, even if it's not present in the 🤗
Transformers library.

We will illustrate all of this on a ResNet model, by wrapping the ResNet class of the
[timm library](https://github.com/rwightman/pytorch-image-models/tree/master/timm) into a [`PreTrainedModel`].

## Writing a custom configuration

Before we dive into the model, let's first write its configuration. The configuration of a model is an object that
will contain all the necessary information to build the model. As we will see in the next section, the model can only
take a `config` to be initialized, so we really need that object to be as complete as possible.

In our example, we will take a couple of arguments of the ResNet class that we might want to tweak. Different
configurations will then give us the different types of ResNets that are possible. We then just store those arguments,
after checking the validity of a few of them.

```python
from transformers import PretrainedConfig
from typing import List


class ResnetConfig(PretrainedConfig):
    model_type = "resnet"

    def __init__(
        self,
        block_type="bottleneck",
        layers: List[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,
        base_width: int = 64,
        stem_width: int = 64,
        stem_type: str = "",
        avg_down: bool = False,
        **kwargs,
    ):
        if block_type not in ["basic", "bottleneck"]:
            raise ValueError(f"`block_type` must be 'basic' or 'bottleneck', got {block_type}.")
        if stem_type not in ["", "deep", "deep-tiered"]:
            raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.")

        self.block_type = block_type
        self.layers = layers
        self.num_classes = num_classes
        self.input_channels = input_channels
        self.cardinality = cardinality
        self.base_width = base_width
        self.stem_width = stem_width
        self.stem_type = stem_type
        self.avg_down = avg_down
        super().__init__(**kwargs)
```

The three important things to remember when writing your own configuration are the following:
- you have to inherit from `PretrainedConfig`,
- the `__init__` of your `PretrainedConfig` must accept any kwargs,
- those `kwargs` need to be passed to the superclass `__init__`.

The inheritance is to make sure you get all the functionality from the 🤗 Transformers library, while the two other
constraints come from the fact that a `PretrainedConfig` has more fields than the ones you are setting. When reloading
a config with the `from_pretrained` method, those fields need to be accepted by your config and then sent to the
superclass.
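
As a minimal sketch of why the kwargs must flow through: `PretrainedConfig` defines many common fields, such as `id2label`, that callers may pass even though your `__init__` never names them:

```py
# `id2label` is a standard PretrainedConfig field; it travels through **kwargs
# and is stored by the superclass __init__.
config = ResnetConfig(block_type="basic", id2label={0: "cat", 1: "dog"})
print(config.id2label)  # {0: 'cat', 1: 'dog'}
```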

Defining a `model_type` for your configuration (here `model_type="resnet"`) is not mandatory, unless you want to
register your model with the auto classes (see last section).

With this done, you can easily create and save your configuration like you would do with any other model config of the
library. Here is how we can create a resnet50d config and save it:

```py
resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
resnet50d_config.save_pretrained("custom-resnet")
```

This will save a file named `config.json` inside the folder `custom-resnet`. You can then reload your config with the
`from_pretrained` method:

```py
resnet50d_config = ResnetConfig.from_pretrained("custom-resnet")
```

You can also use any other method of the [`PretrainedConfig`] class, like [`~PretrainedConfig.push_to_hub`] to
directly upload your config to the Hub.

## Writing a custom model

Now that we have our ResNet configuration, we can go on writing the model. We will actually write two: one that
extracts the hidden features from a batch of images (like [`BertModel`]) and one that is suitable for image
classification (like [`BertForSequenceClassification`]).

As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. The only
thing we need to do before writing this class is a map between the block types and actual block classes. Then the
model is defined from the configuration by passing everything to the `ResNet` class:

```py
from transformers import PreTrainedModel
from timm.models.resnet import BasicBlock, Bottleneck, ResNet
from .configuration_resnet import ResnetConfig


BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck}


class ResnetModel(PreTrainedModel):
    config_class = ResnetConfig

    def __init__(self, config):
        super().__init__(config)
        block_layer = BLOCK_MAPPING[config.block_type]
        self.model = ResNet(
            block_layer,
            config.layers,
            num_classes=config.num_classes,
            in_chans=config.input_channels,
            cardinality=config.cardinality,
            base_width=config.base_width,
            stem_width=config.stem_width,
            stem_type=config.stem_type,
            avg_down=config.avg_down,
        )

    def forward(self, tensor):
        return self.model.forward_features(tensor)
```

For the model that will classify images, we just change the forward method:

```py
import torch


class ResnetModelForImageClassification(PreTrainedModel):
    config_class = ResnetConfig

    def __init__(self, config):
        super().__init__(config)
        block_layer = BLOCK_MAPPING[config.block_type]
        self.model = ResNet(
            block_layer,
            config.layers,
            num_classes=config.num_classes,
            in_chans=config.input_channels,
            cardinality=config.cardinality,
            base_width=config.base_width,
            stem_width=config.stem_width,
            stem_type=config.stem_type,
            avg_down=config.avg_down,
        )

    def forward(self, tensor, labels=None):
        logits = self.model(tensor)
        if labels is not None:
            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
```

In both cases, notice how we inherit from `PreTrainedModel` and call the superclass initialization with the `config`
(a bit like when you write a regular `torch.nn.Module`). The line that sets the `config_class` is not mandatory, unless
you want to register your model with the auto classes (see last section).

<Tip>

If your model is very similar to a model inside the library, you can re-use the same configuration as this model.

</Tip>

You can have your model return anything you want, but returning a dictionary like we did for
`ResnetModelForImageClassification`, with the loss included when labels are passed, will make your model directly
usable inside the [`Trainer`] class. Using another output format is fine as long as you are planning on using your own
training loop or another library for training.
Now that we have our model class, let's create one:
|
||||||
|
|
||||||
|
```py
|
||||||
|
resnet50d = ResnetModelForImageClassification(resnet50d_config)
|
||||||
|
```
|
||||||
|
|
||||||
|
Again, you can use any of the methods of [`PreTrainedModel`], like [`~PreTrainedModel.save_pretrained`] or
|
||||||
|
[`~PreTrainedModel.push_to_hub`]. We will use the second in the next section, and see how to push the model weights
|
||||||
|
with the code of our model. But first, let's load some pretrained weights inside our model.
|
||||||
|
|
||||||
|
In your own use case, you will probably be training your custom model on your own data. To go fast for this tutorial,
|
||||||
|
we will use the pretrained version of the resnet50d. Since our model is just a wrapper around it, it's going to be
|
||||||
|
easy to transfer those weights:
|
||||||
|
|
||||||
|
```py
|
||||||
|
import timm
|
||||||
|
|
||||||
|
pretrained_model = timm.create_model("resnet50d", pretrained=True)
|
||||||
|
resnet50d.model.load_state_dict(pretrained_model.state_dict())
|
||||||
|
```
|
||||||
|
|
||||||
|
Now let's see how to make sure that when we do [`~PreTrainedModel.save_pretrained`] or [`~PreTrainedModel.push_to_hub`], the
|
||||||
|
code of the model is saved.
|
||||||
|
|
||||||

## Sending the code to the Hub

<Tip warning={true}>

This API is experimental and may have some slight breaking changes in the next releases.

</Tip>

First, make sure your model is fully defined in a `.py` file. It can rely on relative imports to some other files as
long as all the files are in the same directory (we don't support submodules for this feature yet). For our example,
we'll define a `modeling_resnet.py` file and a `configuration_resnet.py` file in a folder of the current working
directory named `resnet_model`. The configuration file contains the code for `ResnetConfig` and the modeling file
contains the code of `ResnetModel` and `ResnetModelForImageClassification`.

```
.
└── resnet_model
    ├── __init__.py
    ├── configuration_resnet.py
    └── modeling_resnet.py
```

The `__init__.py` can be empty, it's just there so that Python detects `resnet_model` can be used as a module.

<Tip warning={true}>

If copying modeling files from the library, you will need to replace all the relative imports at the top of the files
to import from the `transformers` package.

</Tip>

Note that you can re-use (or subclass) an existing configuration/model.

To share your model with the community, follow those steps: first import the ResNet model and config from the newly
created files:

```py
from resnet_model.configuration_resnet import ResnetConfig
from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification
```

Then you have to tell the library you want to copy the code files of those objects when using the `save_pretrained`
method and properly register them with a given Auto class (especially for models). To do so, just run:

```py
ResnetConfig.register_for_auto_class()
ResnetModel.register_for_auto_class("AutoModel")
ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification")
```

Note that there is no need to specify an auto class for the configuration (there is only one auto class for them,
[`AutoConfig`]) but it's different for models. Your custom model could be suitable for many different tasks, so you
have to specify which one of the auto classes is the correct one for your model.

Next, let's create the config and models as we did before:

```py
resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
resnet50d = ResnetModelForImageClassification(resnet50d_config)

pretrained_model = timm.create_model("resnet50d", pretrained=True)
resnet50d.model.load_state_dict(pretrained_model.state_dict())
```

Now to send the model to the Hub, make sure you are logged in. Either run in your terminal:

```bash
huggingface-cli login
```

or from a notebook:

```py
from huggingface_hub import notebook_login

notebook_login()
```

You can then push to your own namespace (or an organization you are a member of) like this:

```py
resnet50d.push_to_hub("custom-resnet50d")
```

On top of the modeling weights and the configuration in json format, this also copied the modeling and
configuration `.py` files in the folder `custom-resnet50d` and uploaded the result to the Hub. You can check the result
in this [model repo](https://huggingface.co/sgugger/custom-resnet50d).

See the [sharing tutorial](model_sharing) for more information on the push to Hub method.

## Using a model with custom code

You can use any configuration, model or tokenizer with custom code files in its repository with the auto-classes and
the `from_pretrained` method. All files and code uploaded to the Hub are scanned for malware (refer to the [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) documentation for more information), but you should still
review the model code and author to avoid executing malicious code on your machine. Set `trust_remote_code=True` to use
a model with custom code:

```py
from transformers import AutoModelForImageClassification

model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True)
```

It is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not
update the code with some malicious new lines (unless you fully trust the authors of the models).

```py
commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292"
model = AutoModelForImageClassification.from_pretrained(
    "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash
)
```

Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit
hash of any commit.

## Registering a model with custom code to the auto classes

If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own
model. This is different from pushing the code to the Hub in the sense that users will need to import your library to
get the custom models (contrarily to automatically downloading the model code from the Hub).

As long as your config has a `model_type` attribute that is different from existing model types, and your model
classes have the right `config_class` attributes, you can just add them to the auto classes like this:

```py
from transformers import AutoConfig, AutoModel, AutoModelForImageClassification

AutoConfig.register("resnet", ResnetConfig)
AutoModel.register(ResnetConfig, ResnetModel)
AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification)
```

Note that the first argument used when registering your custom config to [`AutoConfig`] needs to match the `model_type`
of your custom config, and the first argument used when registering your custom models to any auto model class needs
to match the `config_class` of those models.
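
To check the wiring, a quick round trip (a sketch; it assumes the registrations and classes above are in the current session):

```py
resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
resnet50d = ResnetModelForImageClassification(resnet50d_config)
resnet50d.save_pretrained("custom-resnet50d")

# The auto classes now resolve this checkpoint to your custom classes.
model = AutoModelForImageClassification.from_pretrained("custom-resnet50d")
```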

docs/source/debugging.mdx (new file, 335 lines)
@@ -0,0 +1,335 @@

<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Debugging

## Multi-GPU Network Issues Debug

When training or running inference with `DistributedDataParallel` and multiple GPUs, if you run into issues of
inter-communication between processes and/or nodes, you can use the following script to diagnose network issues.

```bash
wget https://raw.githubusercontent.com/huggingface/transformers/master/scripts/distributed/torch-distributed-gpu-test.py
```

For example, to test how 2 GPUs interact, run:

```bash
python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
```

If both processes can talk to each other and allocate GPU memory, each will print an OK status.

For more GPUs or nodes, adjust the arguments in the script.

You will find a lot more details inside the diagnostics script, and even a recipe for how to run it in a SLURM environment.
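
As a sketch of what adjusting those arguments might look like for 2 nodes with 4 GPUs each (the rendezvous host and port are placeholders):

```bash
python -m torch.distributed.run --nproc_per_node 4 --nnodes 2 --rdzv_backend c10d \
    --rdzv_endpoint master_host:29500 torch-distributed-gpu-test.py
```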

An additional level of debug is to add the `NCCL_DEBUG=INFO` environment variable, as follows:

```bash
NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
```

This will dump a lot of NCCL-related debug information, which you can then search online if you find that some problems are reported. Or if you're not sure how to interpret the output, you can share the log file in an Issue.

## Underflow and Overflow Detection

<Tip>

This feature is currently available for PyTorch only.

</Tip>

<Tip>

For multi-GPU training it requires DDP (`torch.distributed.launch`).

</Tip>

<Tip>

This feature can be used with any `nn.Module`-based model.

</Tip>

If you start getting `loss=NaN` or the model exhibits some other abnormal behavior due to `inf` or `nan` in
activations or weights, you need to discover where the first underflow or overflow happens and what led to it. Luckily
you can accomplish that easily by activating a special module that will do the detection automatically.

If you're using [`Trainer`], you just need to add:

```bash
--debug underflow_overflow
```

to the normal command line arguments, or pass `debug="underflow_overflow"` when creating the
[`TrainingArguments`] object.

If you're using your own training loop or another Trainer you can accomplish the same with:

```python
from transformers.debug_utils import DebugUnderflowOverflow

debug_overflow = DebugUnderflowOverflow(model)
```

[`~debug_utils.DebugUnderflowOverflow`] inserts hooks into the model that, immediately after each forward call, test
the input and output variables and the corresponding module's weights. As soon as `inf` or `nan` is detected in at
least one element of the activations or weights, the program will assert and print a report like this (this was
caught with `google/mt5-small` under fp16 mixed precision):

```
Detected inf/nan during batch_number=0
Last 21 forward frames:
abs min  abs max  metadata
                  encoder.block.1.layer.1.DenseReluDense.dropout Dropout
0.00e+00 2.57e+02 input[0]
0.00e+00 2.85e+02 output
[...]
                  encoder.block.2.layer.0 T5LayerSelfAttention
6.78e-04 3.15e+03 input[0]
2.65e-04 3.42e+03 output[0]
             None output[1]
2.25e-01 1.00e+04 output[2]
                  encoder.block.2.layer.1.layer_norm T5LayerNorm
8.69e-02 4.18e-01 weight
2.65e-04 3.42e+03 input[0]
1.79e-06 4.65e+00 output
                  encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
2.17e-07 4.50e+00 weight
1.79e-06 4.65e+00 input[0]
2.68e-06 3.70e+01 output
                  encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
8.08e-07 2.66e+01 weight
1.79e-06 4.65e+00 input[0]
1.27e-04 2.37e+02 output
                  encoder.block.2.layer.1.DenseReluDense.dropout Dropout
0.00e+00 8.76e+03 input[0]
0.00e+00 9.74e+03 output
                  encoder.block.2.layer.1.DenseReluDense.wo Linear
1.01e-06 6.44e+00 weight
0.00e+00 9.74e+03 input[0]
3.18e-04 6.27e+04 output
                  encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
1.79e-06 4.65e+00 input[0]
3.18e-04 6.27e+04 output
                  encoder.block.2.layer.1.dropout Dropout
3.18e-04 6.27e+04 input[0]
0.00e+00      inf output
```

The example output has been trimmed in the middle for brevity.

The second column shows the value of the absolute largest element, so if you have a closer look at the last few frames,
the inputs and outputs were in the range of `1e4`. So when this training was done under fp16 mixed precision the very
last step overflowed (since under `fp16` the largest number before `inf` is `64e3`). To avoid overflows under `fp16`
the activations must remain way below `1e4`, because `1e4 * 1e4 = 1e8` so any matrix multiplication with large
activations is going to lead to a numerical overflow condition.
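
A tiny sketch makes the limit concrete; `torch.finfo` reports fp16's true maximum (65504, i.e. roughly 64K):

```python
import torch

print(torch.finfo(torch.float16).max)  # 65504.0
x = torch.tensor(1e4, dtype=torch.float16)
print(x * x)  # 1e8 overflows fp16 -> tensor(inf, dtype=torch.float16)
```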

At the very start of the trace you can discover at which batch number the problem occurred (here `Detected inf/nan during batch_number=0` means the problem occurred on the first batch).

Each reported frame starts by declaring the fully qualified entry for the corresponding module this frame is reporting
for. If we look just at this frame:

```
encoder.block.2.layer.1.layer_norm T5LayerNorm
8.69e-02 4.18e-01 weight
2.65e-04 3.42e+03 input[0]
1.79e-06 4.65e+00 output
```

Here, `encoder.block.2.layer.1.layer_norm` indicates that it was a layer norm for the first layer of the second
block of the encoder. And the specific call of the `forward` is `T5LayerNorm`.

Let's look at the last few frames of that report:

```
Detected inf/nan during batch_number=0
Last 21 forward frames:
abs min  abs max  metadata
[...]
                  encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
2.17e-07 4.50e+00 weight
1.79e-06 4.65e+00 input[0]
2.68e-06 3.70e+01 output
                  encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
8.08e-07 2.66e+01 weight
1.79e-06 4.65e+00 input[0]
1.27e-04 2.37e+02 output
                  encoder.block.2.layer.1.DenseReluDense.wo Linear
1.01e-06 6.44e+00 weight
0.00e+00 9.74e+03 input[0]
3.18e-04 6.27e+04 output
                  encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
1.79e-06 4.65e+00 input[0]
3.18e-04 6.27e+04 output
                  encoder.block.2.layer.1.dropout Dropout
3.18e-04 6.27e+04 input[0]
0.00e+00      inf output
```

The last frame reports on the `Dropout.forward` function, with the first entry for the only input and the second for
the only output. You can see that it was called from the `dropout` attribute inside the `DenseReluDense` class. We can
see that it happened during the first layer of the 2nd block, during the very first batch. Finally, the absolute
largest input element was `6.27e+04` and the same for the output was `inf`.

You can see here that `T5DenseGatedGeluDense.forward` resulted in output activations whose absolute max value was
around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout`, which renormalizes
the weights after it zeroed some of the elements, which pushes the absolute max value to more than 64K, and we get an
overflow (`inf`).

As you can see, it's the previous frames that we need to look into when the numbers start getting very large for fp16.

Let's match the report to the code from `models/t5/modeling_t5.py`:

```python
class T5DenseGatedGeluDense(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.gelu_act = ACT2FN["gelu_new"]

    def forward(self, hidden_states):
        hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states
```

Now it's easy to see the `dropout` call, and all the previous calls as well.

Since the detection is happening in a forward hook, these reports are printed immediately after each `forward`
returns.

Going back to the full report, to act on it and to fix the problem, we need to go a few frames up to where the numbers
started to go up, and most likely switch to `fp32` mode here so that the numbers don't overflow when multiplied or
summed up. Of course, there might be other solutions. For example, we could turn off `amp` temporarily if it's
enabled, after moving the original `forward` into a helper wrapper, like so:

```python
import torch


def _forward(self, hidden_states):
    hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
    hidden_linear = self.wi_1(hidden_states)
    hidden_states = hidden_gelu * hidden_linear
    hidden_states = self.dropout(hidden_states)
    hidden_states = self.wo(hidden_states)
    return hidden_states


def forward(self, hidden_states):
    if torch.is_autocast_enabled():
        with torch.cuda.amp.autocast(enabled=False):
            return self._forward(hidden_states)
    else:
        return self._forward(hidden_states)
```

Since the automatic detector only reports on inputs and outputs of full frames, once you know where to look, you may
want to analyse the intermediary stages of any specific `forward` function as well. In such a case you can use the
`detect_overflow` helper function to inject the detector where you want it, for example:

```python
from transformers.debug_utils import detect_overflow


class T5LayerFF(nn.Module):
    [...]

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        detect_overflow(forwarded_states, "after layer_norm")
        forwarded_states = self.DenseReluDense(forwarded_states)
        detect_overflow(forwarded_states, "after DenseReluDense")
        return hidden_states + self.dropout(forwarded_states)
```

You can see that we added 2 of these, and now we track whether `inf` or `nan` for `forwarded_states` was detected
somewhere in between.

Actually, the detector already reports these because each of the calls in the example above is an `nn.Module`, but
if you had some local direct calculations, this is how you'd do that.

Additionally, if you're instantiating the debugger in your own code, you can adjust the number of frames printed from
its default, e.g.:

```python
from transformers.debug_utils import DebugUnderflowOverflow

debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
```

### Specific batch absolute min and max value tracing

The same debugging class can be used for per-batch tracing with the underflow/overflow detection feature turned off.

Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a given
batch, and only do that for batches 1 and 3. Then you instantiate this class as:

```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
```

And now full batches 1 and 3 will be traced using the same format as the underflow/overflow detector uses.

Batches are 0-indexed.

This is helpful if you know that the program starts misbehaving after a certain batch number, so you can fast-forward
right to that area. Here is a sample truncated output for such a configuration:

```
*** Starting batch number=1 ***
abs min  abs max  metadata
                  shared Embedding
1.01e-06 7.92e+02 weight
0.00e+00 2.47e+04 input[0]
5.36e-05 7.92e+02 output
[...]
                  decoder.dropout Dropout
1.60e-07 2.27e+01 input[0]
0.00e+00 2.52e+01 output
                  decoder T5Stack
     not a tensor output
                  lm_head Linear
1.01e-06 7.92e+02 weight
0.00e+00 1.11e+00 input[0]
6.06e-02 8.39e+01 output
                  T5ForConditionalGeneration
     not a tensor output

*** Starting batch number=3 ***
abs min  abs max  metadata
                  shared Embedding
1.01e-06 7.92e+02 weight
0.00e+00 2.78e+04 input[0]
5.36e-05 7.92e+02 output
[...]
```

Here you will get a huge number of frames dumped - as many as there were forward calls in your model - so it may or
may not be what you want, but sometimes it can be easier to use for debugging purposes than a normal debugger. For
example, if a problem starts happening at batch number 150, you can dump traces for batches 149 and 150 and compare
where the numbers started to diverge.

You can also specify the batch number after which to stop the training, with:

```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
```
@ -1,299 +0,0 @@
|
|||||||
..
|
|
||||||
Copyright 2021 The HuggingFace Team. All rights reserved.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
|
||||||
the License. You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
|
||||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
|
||||||
specific language governing permissions and limitations under the License.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Debugging
|
|
||||||
=======================================================================================================================
|
|
||||||
|
|
||||||
Underflow and Overflow Detection
|
|
||||||
-----------------------------------------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
This feature is currently available for PyTorch-only.
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
For multi-GPU training it requires DDP (``torch.distributed.launch``).
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
This feature can be used with any ``nn.Module``-based model.
|
|
||||||
|
|
||||||
If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` in
|
|
||||||
activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily
|
|
||||||
you can accomplish that easily by activating a special module that will do the detection automatically.
|
|
||||||
|
|
||||||
If you're using :class:`~transformers.Trainer`, you just need to add:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
--debug underflow_overflow
|
|
||||||
|
|
||||||
to the normal command line arguments, or pass ``debug="underflow_overflow"`` when creating the
|
|
||||||
:class:`~transformers.TrainingArguments` object.
|
|
||||||
|
|
||||||
If you're using your own training loop or another Trainer you can accomplish the same with:
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
from .debug_utils import DebugUnderflowOverflow
|
|
||||||
debug_overflow = DebugUnderflowOverflow(model)
|
|
||||||
|
|
||||||
:class:`~transformers.debug_utils.DebugUnderflowOverflow` inserts hooks into the model that immediately after each
|
|
||||||
forward call will test input and output variables and also the corresponding module's weights. As soon as ``inf`` or
|
|
||||||
``nan`` is detected in at least one element of the activations or weights, the program will assert and print a report
|
|
||||||
like this (this was caught with ``google/mt5-small`` under fp16 mixed precision):
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
Detected inf/nan during batch_number=0
|
|
||||||
Last 21 forward frames:
|
|
||||||
abs min abs max metadata
|
|
||||||
encoder.block.1.layer.1.DenseReluDense.dropout Dropout
|
|
||||||
0.00e+00 2.57e+02 input[0]
|
|
||||||
0.00e+00 2.85e+02 output
|
|
||||||
[...]
|
|
||||||
encoder.block.2.layer.0 T5LayerSelfAttention
|
|
||||||
6.78e-04 3.15e+03 input[0]
|
|
||||||
2.65e-04 3.42e+03 output[0]
|
|
||||||
None output[1]
|
|
||||||
2.25e-01 1.00e+04 output[2]
|
|
||||||
encoder.block.2.layer.1.layer_norm T5LayerNorm
|
|
||||||
8.69e-02 4.18e-01 weight
|
|
||||||
2.65e-04 3.42e+03 input[0]
|
|
||||||
1.79e-06 4.65e+00 output
|
|
||||||
encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
|
|
||||||
2.17e-07 4.50e+00 weight
|
|
||||||
1.79e-06 4.65e+00 input[0]
|
|
||||||
2.68e-06 3.70e+01 output
|
|
||||||
encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
|
|
||||||
8.08e-07 2.66e+01 weight
|
|
||||||
1.79e-06 4.65e+00 input[0]
|
|
||||||
1.27e-04 2.37e+02 output
|
|
||||||
encoder.block.2.layer.1.DenseReluDense.dropout Dropout
|
|
||||||
0.00e+00 8.76e+03 input[0]
|
|
||||||
0.00e+00 9.74e+03 output
|
|
||||||
encoder.block.2.layer.1.DenseReluDense.wo Linear
|
|
||||||
1.01e-06 6.44e+00 weight
|
|
||||||
0.00e+00 9.74e+03 input[0]
|
|
||||||
3.18e-04 6.27e+04 output
|
|
||||||
encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
|
|
||||||
1.79e-06 4.65e+00 input[0]
|
|
||||||
3.18e-04 6.27e+04 output
|
|
||||||
encoder.block.2.layer.1.dropout Dropout
|
|
||||||
3.18e-04 6.27e+04 input[0]
|
|
||||||
0.00e+00 inf output
|
|
||||||
|
|
||||||
The example output has been trimmed in the middle for brevity.
|
|
||||||
|
|
||||||
The second column shows the value of the absolute largest element, so if you have a closer look at the last few frames,
|
|
||||||
the inputs and outputs were in the range of ``1e4``. So when this training was done under fp16 mixed precision the very
|
|
||||||
last step overflowed (since under ``fp16`` the largest number before ``inf`` is ``64e3``). To avoid overflows under
|
|
||||||
``fp16`` the activations must remain way below ``1e4``, because ``1e4 * 1e4 = 1e8`` so any matrix multiplication with
|
|
||||||
large activations is going to lead to a numerical overflow condition.
|
|
||||||
|
|
||||||
At the very start of the trace you can discover at which batch number the problem occurred (here ``Detected inf/nan
|
|
||||||
during batch_number=0`` means the problem occurred on the first batch).
|
|
||||||
|
|
||||||
Each reported frame starts by declaring the fully qualified entry for the corresponding module this frame is reporting
|
|
||||||
for. If we look just at this frame:
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
encoder.block.2.layer.1.layer_norm T5LayerNorm
|
|
||||||
8.69e-02 4.18e-01 weight
|
|
||||||
2.65e-04 3.42e+03 input[0]
|
|
||||||
1.79e-06 4.65e+00 output
|
|
||||||
|
|
||||||
Here, ``encoder.block.2.layer.1.layer_norm`` indicates that it was a layer norm for the first layer, of the second
|
|
||||||
block of the encoder. And the specific calls of the ``forward`` is ``T5LayerNorm``.
|
|
||||||
|
|
||||||
Let's look at the last few frames of that report:

.. code-block::

    Detected inf/nan during batch_number=0
    Last 21 forward frames:
    abs min  abs max  metadata
    [...]
                      encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
    2.17e-07 4.50e+00 weight
    1.79e-06 4.65e+00 input[0]
    2.68e-06 3.70e+01 output
                      encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
    8.08e-07 2.66e+01 weight
    1.79e-06 4.65e+00 input[0]
    1.27e-04 2.37e+02 output
                      encoder.block.2.layer.1.DenseReluDense.wo Linear
    1.01e-06 6.44e+00 weight
    0.00e+00 9.74e+03 input[0]
    3.18e-04 6.27e+04 output
                      encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
    1.79e-06 4.65e+00 input[0]
    3.18e-04 6.27e+04 output
                      encoder.block.2.layer.1.dropout Dropout
    3.18e-04 6.27e+04 input[0]
    0.00e+00      inf output

The last frame reports on the ``Dropout.forward`` function, with the first entry for its only input and the second for
its only output. You can see that it was called from the ``dropout`` attribute of the ``DenseReluDense`` class. We can
see that this happened during the first layer of the 2nd block, during the very first batch. Finally, the absolute
largest input element was ``6.27e+04`` and the same for the output was ``inf``.

You can see here that ``T5DenseGatedGeluDense.forward`` resulted in output activations whose absolute max value was
around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have ``Dropout``, which
renormalizes the weights after it has zeroed some of the elements, and that pushes the absolute max value beyond 64K,
resulting in an overflow (``inf``).

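To see concretely how close to the edge this is, here is a small fp16 sanity check you can run. It is a hedged sketch:
the ``0.1`` dropout probability is an assumption for illustration (the real value comes from ``config.dropout_rate``),
but the rescaling by ``1/(1-p)`` is exactly what ``Dropout`` does in training mode:

.. code-block:: python

    import torch

    # the largest finite fp16 value
    print(torch.finfo(torch.float16).max)  # 65504.0

    # an activation near the observed absolute max of 6.27e+04
    x = torch.tensor(6.27e4, dtype=torch.float16)

    # dropout rescales the surviving elements by 1/(1-p); with p=0.1 that's ~1.11x
    print(x / (1 - 0.1))  # tensor(inf, dtype=torch.float16)
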
As you can see, it's the previous frames that we need to look into when the numbers start growing very large for fp16.

Let's match the report to the code from ``models/t5/modeling_t5.py``:

.. code-block:: python

    class T5DenseGatedGeluDense(nn.Module):
        def __init__(self, config):
            super().__init__()
            self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
            self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
            self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
            self.dropout = nn.Dropout(config.dropout_rate)
            self.gelu_act = ACT2FN["gelu_new"]

        def forward(self, hidden_states):
            hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
            hidden_linear = self.wi_1(hidden_states)
            hidden_states = hidden_gelu * hidden_linear
            hidden_states = self.dropout(hidden_states)
            hidden_states = self.wo(hidden_states)
            return hidden_states

Now it's easy to see the ``dropout`` call, and all the previous calls as well.

Since the detection happens in a forward hook, these reports are printed immediately after each ``forward`` returns.

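As background, the detection mechanism itself is just PyTorch forward hooks. A minimal sketch of the idea (not the
actual ``DebugUnderflowOverflow`` implementation) looks like this, where ``model`` is the same model the debugger was
given:

.. code-block:: python

    import torch

    def report_hook(module, inputs, output):
        # only inspect tensor outputs; modules may also return tuples or None
        if isinstance(output, torch.Tensor) and not torch.isfinite(output).all():
            print(f"inf/nan detected in the output of {module.__class__.__name__}")

    # register the hook on every submodule of the model
    hook_handles = [m.register_forward_hook(report_hook) for m in model.modules()]
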
Going back to the full report: to act on it and fix the problem, we need to go a few frames up to where the numbers
started to grow, and most likely switch to ``fp32`` mode there, so that the numbers don't overflow when multiplied or
summed up. Of course, there might be other solutions. For example, we could turn off ``amp`` temporarily, if it's
enabled, after moving the original ``forward`` into a helper wrapper, like so:

.. code-block:: python

    def _forward(self, hidden_states):
        hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states


    import torch


    def forward(self, hidden_states):
        if torch.is_autocast_enabled():
            # disable autocast for this sub-module so it runs in full fp32 precision
            with torch.cuda.amp.autocast(enabled=False):
                return self._forward(hidden_states)
        else:
            return self._forward(hidden_states)

Since the automatic detector only reports on the inputs and outputs of full frames, once you know where to look you
may also want to analyse the intermediate stages of a specific ``forward`` function. In that case you can use the
``detect_overflow`` helper function to inject the detector where you want it, for example:

.. code-block:: python

    from debug_utils import detect_overflow


    class T5LayerFF(nn.Module):
        [...]

        def forward(self, hidden_states):
            forwarded_states = self.layer_norm(hidden_states)
            detect_overflow(forwarded_states, "after layer_norm")
            forwarded_states = self.DenseReluDense(forwarded_states)
            detect_overflow(forwarded_states, "after DenseReluDense")
            return hidden_states + self.dropout(forwarded_states)

You can see that we added 2 of these, and now we track whether an ``inf`` or ``nan`` for ``forwarded_states`` was
detected somewhere in between.

Actually, the detector already reports these, because each of the calls in the example above is an ``nn.Module``; but
let's say you had some local direct calculations: this is how you'd track those.

Additionally, if you're instantiating the debugger in your own code, you can adjust the number of frames printed from
its default, e.g.:

.. code-block:: python

    from .debug_utils import DebugUnderflowOverflow

    debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)

Specific batch absolute min and max value tracing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The same debugging class can be used for per-batch tracing with the underflow/overflow detection feature turned off.

Let's say you want to watch the absolute min and max values for all the ingredients of each ``forward`` call of a
given batch, and only do that for batches 1 and 3. Then you instantiate this class as:

.. code-block:: python

    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])

And now full batches 1 and 3 will be traced using the same format the underflow/overflow detector uses.

Batches are 0-indexed.

This is helpful if you know that the program starts misbehaving after a certain batch number, so you can fast-forward
right to that area. Here is a sample truncated output for such a configuration:

.. code-block::

    *** Starting batch number=1 ***
    abs min  abs max  metadata
                      shared Embedding
    1.01e-06 7.92e+02 weight
    0.00e+00 2.47e+04 input[0]
    5.36e-05 7.92e+02 output
    [...]
                      decoder.dropout Dropout
    1.60e-07 2.27e+01 input[0]
    0.00e+00 2.52e+01 output
                      decoder T5Stack
         not a tensor output
                      lm_head Linear
    1.01e-06 7.92e+02 weight
    0.00e+00 1.11e+00 input[0]
    6.06e-02 8.39e+01 output
                      T5ForConditionalGeneration
         not a tensor output

    *** Starting batch number=3 ***
    abs min  abs max  metadata
                      shared Embedding
    1.01e-06 7.92e+02 weight
    0.00e+00 2.78e+04 input[0]
    5.36e-05 7.92e+02 output
    [...]

Here you will get a huge number of frames dumped - as many as there were forward calls in your model - so it may or
may not be what you want, but sometimes it can be easier to use for debugging purposes than a normal debugger. For
example, if a problem starts happening at batch number 150, you can dump traces for batches 149 and 150 and compare
where the numbers started to diverge.

You can also specify the batch number after which to stop the training, with:

.. code-block:: python

    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)

@@ -1 +0,0 @@
-../../examples/README.md

docs/source/fast_tokenizers.mdx (new file)
@@ -0,0 +1,70 @@

<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Using tokenizers from 🤗 Tokenizers

The [`PreTrainedTokenizerFast`] depends on the [🤗 Tokenizers](https://huggingface.co/docs/tokenizers) library. The
tokenizers obtained from the 🤗 Tokenizers library can be loaded very simply into 🤗 Transformers.

Before getting into the specifics, let's first start by creating a dummy tokenizer in a few lines:

```python
>>> from tokenizers import Tokenizer
>>> from tokenizers.models import BPE
>>> from tokenizers.trainers import BpeTrainer
>>> from tokenizers.pre_tokenizers import Whitespace

>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

>>> tokenizer.pre_tokenizer = Whitespace()
>>> files = [...]
>>> tokenizer.train(files, trainer)
```

We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it
to a JSON file for future re-use.

## Loading directly from the tokenizer object

Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
[`PreTrainedTokenizerFast`] class allows for easy instantiation, by accepting the instantiated
*tokenizer* object as an argument:

```python
>>> from transformers import PreTrainedTokenizerFast

>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
```

This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer
page](main_classes/tokenizer) for more information.

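As a quick sanity check, the wrapped object supports the standard tokenizer call. This is a hedged sketch: the exact
keys depend on the tokenizer's configuration, and the token ids depend on the files it was trained on, so none are
shown here:

```python
>>> encoding = fast_tokenizer("Let's test this tokenizer.")
>>> encoding.keys()
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
```
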
## Loading from a JSON file

In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:

```python
>>> tokenizer.save("tokenizer.json")
```

The path to which we saved this file can be passed to the [`PreTrainedTokenizerFast`] initialization
method using the `tokenizer_file` parameter:

```python
>>> from transformers import PreTrainedTokenizerFast

>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
```

This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer
page](main_classes/tokenizer) for more information.

docs/source/glossary.mdx (new file)
@@ -0,0 +1,300 @@

<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Glossary

## General terms

- autoencoding models: see MLM
- autoregressive models: see CLM
- CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
  next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future
  tokens at a certain timestep.
- deep learning: machine learning algorithms which use neural networks with several layers.
- MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually
  done by masking some tokens randomly, and has to predict the original text.
- multimodal: a task that combines texts with another kind of input (for instance images).
- NLG: natural language generation, all tasks related to generating text (for instance talk with transformers,
  translation).
- NLP: natural language processing, a generic way to say "deal with texts".
- NLU: natural language understanding, all tasks related to understanding what is in a text (for instance classifying
  the whole text, or individual words).
- pretrained model: a model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods
  involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
  masking some words and trying to predict them (see MLM).
- RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
- self-attention: each element of the input finds out which other elements of the input it should attend to.
- seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
  summarization models (such as [Bart](model_doc/bart) or [T5](model_doc/t5)).
- token: a part of a sentence, usually a word, but can also be a subword (non-common words are often split in
  subwords) or a punctuation symbol.
- transformer: self-attention based deep learning model architecture.

## Model inputs

Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
detailed here alongside usage examples.

<a id='input-ids'></a>

### Input IDs

The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
numerical representations of tokens building the sequences that will be used as input by the model*.

<Youtube id="VFp38yj8h3A"/>

Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
tokenizer, which is a [WordPiece](https://arxiv.org/pdf/1609.08144.pdf) tokenizer:

```python
>>> from transformers import BertTokenizer

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

>>> sequence = "A Titan RTX has 24GB of VRAM"
```

The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.

```python
>>> tokenized_sequence = tokenizer.tokenize(sequence)
```

The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
into "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash
prefix is added for "RA" and "M":

```python
>>> print(tokenized_sequence)
['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
```

These tokens can then be converted into IDs which are understandable by the model. This can be done by directly
feeding the sentence to the tokenizer, which leverages the Rust implementation of
[🤗 Tokenizers](https://github.com/huggingface/tokenizers) for peak performance.

```python
>>> inputs = tokenizer(sequence)
```

The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
token indices are under the key "input_ids":

```python
>>> encoded_sequence = inputs["input_ids"]
>>> print(encoded_sequence)
[101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
```

Note that the tokenizer automatically adds "special tokens" (if the associated model relies on them) which are special
IDs the model sometimes uses.

If we decode the previous sequence of ids,

```python
>>> decoded_sequence = tokenizer.decode(encoded_sequence)
```

we will see

```python
>>> print(decoded_sequence)
[CLS] A Titan RTX has 24GB of VRAM [SEP]
```

because this is the way a [`BertModel`] is going to expect its inputs.

<a id='attention-mask'></a>

### Attention mask

The attention mask is an optional argument used when batching sequences together.

<Youtube id="M6adb1j2jPI"/>

This argument indicates to the model which tokens should be attended to, and which should not.

For example, consider these two sequences:

```python
>>> from transformers import BertTokenizer

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

>>> sequence_a = "This is a short sequence."
>>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

>>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
>>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
```

The encoded versions have different lengths:

```python
>>> len(encoded_sequence_a), len(encoded_sequence_b)
(8, 19)
```

Therefore, we can't put them together in the same tensor as-is. The first sequence needs to be padded up to the length
of the second one, or the second one needs to be truncated down to the length of the first one.

In the first case, the list of IDs will be extended by the padding indices. We can pass a list to the tokenizer and
ask it to pad like this:

```python
>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
```

We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:

```python
>>> padded_sequences["input_ids"]
[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
```

This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
the position of the padded indices so that the model does not attend to them. For the [`BertTokenizer`],
`1` indicates a value that should be attended to, while `0` indicates a padded value. This attention mask is
in the dictionary returned by the tokenizer under the key "attention_mask":

```python
>>> padded_sequences["attention_mask"]
[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
```

<a id='token-type-ids'></a>

### Token Type IDs

Some models' purpose is to do classification on pairs of sentences or question answering.

<Youtube id="0u3ioSwev3s"/>

These require two different sequences to be joined in a single "input_ids" entry, which usually is performed with the
help of special tokens, such as the classifier (`[CLS]`) and separator (`[SEP]`) tokens. For example, the BERT
model builds its two sequence input as such:

```python
>>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
```

We can use our tokenizer to automatically generate such a sentence by passing the two sequences to `tokenizer` as two
arguments (and not a list, like before) like this:

```python
>>> from transformers import BertTokenizer

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence_a = "HuggingFace is based in NYC"
>>> sequence_b = "Where is HuggingFace based?"

>>> encoded_dict = tokenizer(sequence_a, sequence_b)
>>> decoded = tokenizer.decode(encoded_dict["input_ids"])
```

which will return:

```python
>>> print(decoded)
[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
```

This is enough for some models to understand where one sequence ends and where another begins. However, other models,
such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary mask identifying
the two types of sequence in the model.

The tokenizer returns this mask as the "token_type_ids" entry:

```python
>>> encoded_dict["token_type_ids"]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
```

The first sequence, the "context" used for the question, has all its tokens represented by a `0`, whereas the
second sequence, corresponding to the "question", has all its tokens represented by a `1`.

Some models, like [`XLNetModel`], use an additional token represented by a `2`.

<a id='position-ids'></a>

### Position IDs

Contrary to RNNs, which have the position of each token embedded within them, transformers are unaware of the position
of each token. Therefore, the position IDs (`position_ids`) are used by the model to identify each token's position in
the list of tokens.

They are an optional parameter. If no `position_ids` are passed to the model, the IDs are automatically created as
absolute positional embeddings.

Absolute positional embeddings are selected in the range `[0, config.max_position_embeddings - 1]`. Some models use
other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.

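As a minimal sketch of that default (toy shapes built here by hand; each model actually constructs these internally),
absolute position IDs are just the token offsets broadcast across the batch:

```python
>>> import torch

>>> batch_size, sequence_length = 2, 10
>>> position_ids = torch.arange(sequence_length).unsqueeze(0).expand(batch_size, -1)
>>> position_ids[0]
tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
```
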
<a id='labels'></a>

### Labels

The labels are an optional argument which can be passed in order for the model to compute the loss itself. These
labels should be the expected prediction of the model: it will use the standard loss in order to compute the loss
between its predictions and the expected value (the label).

These labels are different according to the model head, for example:

- For sequence classification models (e.g., [`BertForSequenceClassification`]), the model expects a
  tensor of dimension `(batch_size)` with each value of the batch corresponding to the expected label of the
  entire sequence.
- For token classification models (e.g., [`BertForTokenClassification`]), the model expects a tensor
  of dimension `(batch_size, seq_length)` with each value corresponding to the expected label of each individual
  token.
- For masked language modeling (e.g., [`BertForMaskedLM`]), the model expects a tensor of dimension
  `(batch_size, seq_length)` with each value corresponding to the expected label of each individual token: the
  labels being the token ID for the masked token, and values to be ignored for the rest (usually -100).
- For sequence-to-sequence tasks (e.g., [`BartForConditionalGeneration`],
  [`MBartForConditionalGeneration`]), the model expects a tensor of dimension `(batch_size, tgt_seq_length)` with each
  value corresponding to the target sequences associated with each input sequence. During training, both *BART* and
  *T5* will make the appropriate *decoder_input_ids* and decoder attention masks internally. They usually do not need
  to be supplied. This does not apply to models leveraging the Encoder-Decoder framework. See the documentation of
  each model for more information on each specific model's labels.

The base models (e.g., [`BertModel`]) do not accept labels, as these are the base transformer
models, simply outputting features.

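For example, here is a sketch of passing labels to a sequence classification head so that the model returns the loss;
the label value `1` is made up for illustration, and the classification head of `bert-base-cased` is untrained, so the
loss value itself is meaningless:

```python
>>> import torch
>>> from transformers import BertForSequenceClassification, BertTokenizer

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> model = BertForSequenceClassification.from_pretrained("bert-base-cased")

>>> inputs = tokenizer("HuggingFace is based in NYC", return_tensors="pt")
>>> labels = torch.tensor([1])  # one label for the entire sequence

>>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss  # the model computed the loss against the labels itself
```
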
<a id='decoder-input-ids'></a>

### Decoder input IDs

This input is specific to encoder-decoder models, and contains the input IDs that will be fed to the decoder. These
inputs should be used for sequence to sequence tasks, such as translation or summarization, and are usually built in a
way specific to each model.

Most encoder-decoder models (BART, T5) create their `decoder_input_ids` on their own from the `labels`. In
such models, passing the `labels` is the preferred way to handle training.

Please check each model's docs to see how they handle these input IDs for sequence to sequence training.

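For example, with T5 it is enough to supply `labels`; a short sketch (any T5 checkpoint would do):

```python
>>> from transformers import T5ForConditionalGeneration, T5Tokenizer

>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")

>>> input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
>>> labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids

>>> # no decoder_input_ids passed: the model creates them by shifting the labels right
>>> loss = model(input_ids=input_ids, labels=labels).loss
```
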
<a id='feed-forward-chunking'></a>

### Feed Forward Chunking

In each residual attention block in transformers the self-attention layer is usually followed by 2 feed forward
layers. The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model
(e.g., for `bert-base-uncased`).

For an input of size `[batch_size, sequence_length]`, the memory required to store the intermediate feed forward
embeddings `[batch_size, sequence_length, config.intermediate_size]` can account for a large fraction of the memory
use. The authors of [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) noticed that since the
computation is independent of the `sequence_length` dimension, it is mathematically equivalent to compute the output
embeddings of both feed forward layers `[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n`
individually and concatenate them afterward to `[batch_size, sequence_length, config.hidden_size]` with
`n = sequence_length`, which trades increased computation time against reduced memory use, but yields a mathematically
**equivalent** result.

For models employing the function [`apply_chunking_to_forward`], the `chunk_size` defines the
number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time
complexity. If `chunk_size` is set to 0, no feed forward chunking is done.

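As a hedged sketch of how this function is used (the module and sizes below are toy values invented for illustration;
the import path is an assumption that may differ between library versions):

```python
>>> import torch
>>> from transformers.modeling_utils import apply_chunking_to_forward

>>> dense_in = torch.nn.Linear(64, 256)
>>> dense_out = torch.nn.Linear(256, 64)

>>> def feed_forward(hidden_states):
...     return dense_out(torch.nn.functional.relu(dense_in(hidden_states)))

>>> hidden_states = torch.randn(1, 128, 64)  # [batch_size, sequence_length, hidden_size]

>>> # chunk_size=32 on dim 1 runs the feed forward on 4 slices of 32 positions each,
>>> # so the large [.., .., 256] intermediate exists for only one slice at a time
>>> output = apply_chunking_to_forward(feed_forward, 32, 1, hidden_states)
>>> output.shape
torch.Size([1, 128, 64])
```
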
@ -12,25 +12,18 @@ specific language governing permissions and limitations under the License.
|
|||||||
|
|
||||||
# 🤗 Transformers
|
# 🤗 Transformers
|
||||||
|
|
||||||
State-of-the-art Machine Learning for Jax, Pytorch and TensorFlow
|
State-of-the-art Machine Learning for PyTorch, TensorFlow and JAX.
|
||||||
|
|
||||||
🤗 Transformers (formerly known as _pytorch-transformers_ and _pytorch-pretrained-bert_) provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.
|
🤗 Transformers provides APIs to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you time from training a model from scratch. The models can be used across different modalities such as:
|
||||||
|
|
||||||
These models can applied on:
|
* 📝 Text: text classification, information extraction, question answering, summarization, translation, and text generation in over 100 languages.
|
||||||
|
* 🖼️ Images: image classification, object detection, and segmentation.
|
||||||
|
* 🗣️ Audio: speech recognition and audio classification.
|
||||||
|
* 🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
|
||||||
|
|
||||||
* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, text generation, in over 100 languages.
|
Our library supports seamless integration between three of the most popular deep learning libraries: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) and [JAX](https://jax.readthedocs.io/en/latest/). Train your model in three lines of code in one framework, and load it for inference with another.
|
||||||
* 🖼️ Images, for tasks like image classification, object detection, and segmentation.
|
|
||||||
* 🗣️ Audio, for tasks like speech recognition and audio classification.
|
|
||||||
|
|
||||||
Transformer models can also perform tasks on **several modalities combined**, such as table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
|
Each 🤗 Transformers architecture is defined in a standalone Python module so they can be easily customized for research and experiments.
|
||||||
|
|
||||||
🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments.
|
|
||||||
|
|
||||||
🤗 Transformers is backed by the three most popular deep learning libraries — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.
|
|
||||||
|
|
||||||
This is the documentation of our repository [transformers](https://github.com/huggingface/transformers). You can
|
|
||||||
also follow our [online course](https://huggingface.co/course) that teaches how to use this library, as well as the
|
|
||||||
other libraries developed by Hugging Face and the Hub.
|
|
||||||
|
|
||||||
## If you are looking for custom support from the Hugging Face team
|
## If you are looking for custom support from the Hugging Face team
|
||||||
|
|
||||||
@ -38,35 +31,6 @@ other libraries developed by Hugging Face and the Hub.
|
|||||||
<img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
|
<img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
|
||||||
</a><br>
|
</a><br>
|
||||||
|
## Features

1. Easy-to-use state-of-the-art models (see the pipeline sketch after this list):
    - High performance on natural language understanding & generation, computer vision, and audio tasks.
    - Low barrier to entry for educators and practitioners.
    - Few user-facing abstractions with just three classes to learn.
    - A unified API for using all our pretrained models.

1. Lower compute costs, smaller carbon footprint:
    - Researchers can share trained models instead of always retraining.
    - Practitioners can reduce compute time and production costs.
    - Dozens of architectures with over 20,000 pretrained models, some in more than 100 languages.

1. Choose the right framework for every part of a model's lifetime:
    - Train state-of-the-art models in 3 lines of code.
    - Move a single model between TF2.0/PyTorch/JAX frameworks at will.
    - Seamlessly pick the right framework for training, evaluation and production.

1. Easily customize a model or an example to your needs:
    - We provide examples for each architecture to reproduce the results published by its original authors.
    - Model internals are exposed as consistently as possible.
    - Model files can be used independently of the library for quick experiments.
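As a hedged illustration of the first point, the `pipeline` API wraps model download, preprocessing and inference in a couple of lines (the default sentiment-analysis checkpoint it fetches is chosen by the library, not pinned here):

```python
from transformers import pipeline

# Downloads a default sentiment-analysis checkpoint on first use.
classifier = pipeline("sentiment-analysis")
print(classifier("We are very happy to show you the 🤗 Transformers library."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]
```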
[All the model checkpoints](https://huggingface.co/models) are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).

Current number of checkpoints: <img src="https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen">
## Contents

The documentation is organized in five parts.
The library currently contains JAX, PyTorch and TensorFlow implementations, pretrained model weights, usage scripts and conversion utilities for the following models.

<!--This list is updated automatically from the README with _make fix-copies_. Do not update manually! -->
1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
1. **[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
1. **[BARThez](model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
1. **[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation) and a German version of DistilBERT.
1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MBart](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[REALM](model_doc/realm)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/abs/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[XGLM](model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[XLS-R](model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
### Supported frameworks

The table below shows the current support in the library for each of those models: whether they have a Python ("slow") tokenizer, a "fast" tokenizer backed by the 🤗 Tokenizers library, and whether they are supported in Jax (via Flax), PyTorch, and/or TensorFlow.

<!--This table is updated automatically from the auto modules with _make fix-copies_. Do not update manually!-->

| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:|
| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ |
| BART | ✅ | ✅ | ✅ | ✅ | ✅ |
| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ |
| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ |
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| Canine | ✅ | ❌ | ✅ | ❌ | ❌ |
| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ |
| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ |
| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ |
| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ |
| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ |
| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ |
| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ |
| Marian | ✅ | ❌ | ✅ | ✅ | ✅ |
| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
| mBART | ✅ | ✅ | ✅ | ✅ | ✅ |
| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ |
| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ |
| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ |
| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ |
| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ |
| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ |
| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ |
| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ |
| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ |
| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ |
| RAG | ✅ | ❌ | ✅ | ✅ | ❌ |
| Realm | ✅ | ✅ | ✅ | ❌ | ❌ |
| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ |
| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
| SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ |
| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ |
| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ |
| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ |
| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ |
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
| Swin | ❌ | ❌ | ✅ | ❌ | ❌ |
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ |
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ |
| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ |
| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ |
| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ |
| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ |
| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ |
| ViT | ❌ | ❌ | ✅ | ✅ | ✅ |
| ViTMAE | ❌ | ❌ | ✅ | ❌ | ❌ |
| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ |
| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ |
| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ |
| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ |
| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ |
| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ |

<!-- End table-->
||||||
|
@ -1,198 +0,0 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Installation

🤗 Transformers is tested on Python 3.6+, and PyTorch 1.1.0+ or TensorFlow 2.0+.

You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're
unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going
to use and activate it.

Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you
must install it from source.

## Installation with pip

First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available),
[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or
[Flax installation page](https://github.com/google/flax#quick-install)
for the specific install command for your platform.

When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:

```bash
pip install transformers
```

Alternatively, for CPU-support only, you can install 🤗 Transformers and PyTorch in one line with:

```bash
pip install transformers[torch]
```

or 🤗 Transformers and TensorFlow 2.0 in one line with:

```bash
pip install transformers[tf-cpu]
```

or 🤗 Transformers and Flax in one line with:

```bash
pip install transformers[flax]
```

To check 🤗 Transformers is properly installed, run the following command:

```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
```

It should download a pretrained model then print something like

```bash
[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
```

(Note that TensorFlow will print additional messages before that last line.)

## Installing from source

Here is how to quickly install `transformers` from source:

```bash
pip install git+https://github.com/huggingface/transformers
```

Note that this will install not the latest released version, but the bleeding edge `master` version, which you may want to use if a bug has been fixed since the last official release and a new release hasn't yet been rolled out.

While we strive to keep `master` operational at all times, if you notice some issues, they usually get fixed within a few hours or a day. You're more than welcome to help us detect any problems by opening an [Issue](https://github.com/huggingface/transformers/issues); this way, things will get fixed even sooner.

Again, you can run:

```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
```

to check 🤗 Transformers is properly installed.

## Editable install

If you want to constantly use the bleeding edge `master` version of the source code, or if you want to contribute to the library and need to test the changes in the code you're making, you will need an editable install. This is done by cloning the repository and installing with the following commands:

```bash
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
```

This command links the folder you cloned the repository to with your Python library paths, so Python will look inside this folder in addition to the normal library-wide paths. So if normally your Python packages get installed into:

```
~/anaconda3/envs/main/lib/python3.7/site-packages/
```

now this editable install will reside where you cloned the folder to, e.g. `~/transformers/`, and Python will search it too.

Do note that you have to keep that `transformers` folder around, and not delete it, to continue using the `transformers` library.

Now, let's get to the real benefit of this installation approach. Say you see that a new feature has just been committed to `master`. If you have already performed all the steps above, to update your copy of transformers to include all the latest commits, all you need to do is `cd` into the cloned repository folder and update the clone to the latest version:

```
cd ~/transformers/
git pull
```

There is nothing else to do. Your Python environment will find the bleeding edge version of `transformers` on the next run.

## With conda

Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.

🤗 Transformers can be installed using conda as follows:

```
conda install -c huggingface transformers
```

Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.

## Caching models

This library provides pretrained models that will be downloaded and cached locally. Unless you specify a location with
`cache_dir=...` when you use methods like `from_pretrained`, these models will automatically be downloaded to the
folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it is the Hugging
Face cache home followed by ``/transformers/``. This is (by order of priority):

* shell environment variable ``HF_HOME``
* shell environment variable ``XDG_CACHE_HOME`` + ``/huggingface/``
* default: ``~/.cache/huggingface/``

So if you don't have any specific environment variable set, the cache directory will be at
``~/.cache/huggingface/transformers/``.

**Note:** If you have set a shell environment variable for one of the predecessors of this library
(``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), it will be used if there is no shell
environment variable for ``TRANSFORMERS_CACHE``.

### Offline mode

It's possible to run 🤗 Transformers in a firewalled or no-network environment.

Setting the environment variable `TRANSFORMERS_OFFLINE=1` will tell 🤗 Transformers to use local files only; it will not try to look things up online.

Most likely you will want to couple this with `HF_DATASETS_OFFLINE=1`, which does the same for 🤗 Datasets, if you're using the latter.

Here is an example of how this can be used on a filesystem shared between a normally networked instance and an instance firewalled from the external world.

On the instance with normal network access, run your program, which will download and cache models (and, optionally, datasets if you use 🤗 Datasets). For example:

```
python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
```

Then, using the same filesystem, you can run the same program on a firewalled instance:

```
HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
```

and it should succeed without hanging while waiting to time out.

#### Fetching models and tokenizers to use offline

When running a script for the first time, as described above, the downloaded files will be cached for future reuse.
However, it is also possible to download files ahead of time and point to their local path instead.

Downloading files can be done through the web interface by clicking on the "Download" button, but it can also be handled
programmatically using the `huggingface_hub` library, which is a dependency of `transformers`:

- Use `snapshot_download` to download an entire repository
- Use `hf_hub_download` to download a specific file

See the reference for these methods in the huggingface_hub
[documentation](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub).

## Do you want to run a Transformer model on a mobile device?

You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.

It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently `GPT-2`,
`DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.

At some point in the future, you'll be able to seamlessly move from pretraining or fine-tuning models in PyTorch or
TensorFlow 2.0 to productizing them in CoreML, or prototype a model or an app in CoreML and then research its
hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!
235
docs/source/installation.mdx
Normal file
@ -0,0 +1,235 @@
<!---
Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Installation

Install 🤗 Transformers for whichever deep learning library you're working with, set up your cache, and optionally configure 🤗 Transformers to run offline.

🤗 Transformers is tested on Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, and Flax. Follow the installation instructions below for the deep learning library you are using:

* [PyTorch](https://pytorch.org/get-started/locally/) installation instructions.
* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) installation instructions.
* [Flax](https://flax.readthedocs.io/en/latest/) installation instructions.

## Install with pip

You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies.

Start by creating a virtual environment in your project directory:

```bash
python -m venv .env
```

Activate the virtual environment:

```bash
source .env/bin/activate
```

Now you're ready to install 🤗 Transformers with the following command:

```bash
pip install transformers
```

For CPU-support only, you can conveniently install 🤗 Transformers and a deep learning library in one line. For example, install 🤗 Transformers and PyTorch with:

```bash
pip install transformers[torch]
```

🤗 Transformers and TensorFlow 2.0:

```bash
pip install transformers[tf-cpu]
```

🤗 Transformers and Flax:

```bash
pip install transformers[flax]
```

Finally, check if 🤗 Transformers has been properly installed by running the following command. It will download a pretrained model:

```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
```

Then it should print the label and score:

```bash
[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
```

## Install from source

Install 🤗 Transformers from source with the following command:

```bash
pip install git+https://github.com/huggingface/transformers
```

This command installs the bleeding edge `master` version rather than the latest `stable` version. The `master` version is useful for staying up-to-date with the latest developments, for instance if a bug has been fixed since the last official release but a new release hasn't been rolled out yet. However, this means the `master` version may not always be stable. We strive to keep the `master` version operational, and most issues are usually resolved within a few hours or a day. If you run into a problem, please open an [Issue](https://github.com/huggingface/transformers/issues) so we can fix it even sooner!

Check if 🤗 Transformers has been properly installed by running the following command:

```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))"
```

## Editable install

You will need an editable install if you'd like to:

* Use the `master` version of the source code.
* Contribute to 🤗 Transformers and need to test changes in the code.

Clone the repository and install 🤗 Transformers with the following commands:

```bash
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
```

These commands will link the folder you cloned the repository to with your Python library paths. Python will now look inside the folder you cloned to in addition to the normal library paths. For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python will also search the folder you cloned to: `~/transformers/`.

<Tip warning={true}>

You must keep the `transformers` folder if you want to keep using the library.

</Tip>

Now you can easily update your clone to the latest version of 🤗 Transformers with the following command:

```bash
cd ~/transformers/
git pull
```

Your Python environment will find the `master` version of 🤗 Transformers on the next run.

## Install with conda

Install from the conda channel `huggingface`:

```bash
conda install -c huggingface transformers
```

## Cache setup

Pretrained models are downloaded and locally cached at: `~/.cache/huggingface/transformers/`. This is the default directory given by the shell environment variable `TRANSFORMERS_CACHE`. On Windows, the default directory is given by `C:\Users\username\.cache\huggingface\transformers`. You can change the shell environment variables shown below - in order of priority - to specify a different cache directory (a shell example follows the list):

1. Shell environment variable (default): `TRANSFORMERS_CACHE`.
2. Shell environment variable: `HF_HOME` + `transformers/`.
3. Shell environment variable: `XDG_CACHE_HOME` + `/huggingface/transformers`.
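
For example, to keep the cache on a larger data disk you could export the variable before launching Python. This is a minimal sketch; the path is illustrative:

```bash
# store 🤗 Transformers downloads on a bigger disk (illustrative path)
export TRANSFORMERS_CACHE=/mnt/storage/huggingface/transformers
```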

<Tip>

🤗 Transformers will use the shell environment variables `PYTORCH_TRANSFORMERS_CACHE` or `PYTORCH_PRETRAINED_BERT_CACHE` if you are coming from an earlier iteration of this library and have set those environment variables, unless you specify the shell environment variable `TRANSFORMERS_CACHE`.

</Tip>

## Offline mode

🤗 Transformers is able to run in a firewalled or offline environment by only using local files. Set the environment variable `TRANSFORMERS_OFFLINE=1` to enable this behavior.

<Tip>

Add [🤗 Datasets](https://huggingface.co/docs/datasets/) to your offline training workflow by setting the environment variable `HF_DATASETS_OFFLINE=1`.

</Tip>

For example, you would typically first run your program on a machine with normal network access, which downloads and caches the files:

```bash
python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
```

Run this same program in an offline instance with:

```bash
HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
```

The script should now run without hanging or waiting to timeout because it knows it should only look for local files.
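
As a hedged aside (not part of the original page), individual `from_pretrained` calls can also be restricted to local files with the `local_files_only` parameter, assuming the checkpoint is already cached:

```py
>>> from transformers import AutoModel

>>> # raises an error instead of trying to download if the files aren't cached yet
>>> model = AutoModel.from_pretrained("t5-small", local_files_only=True)
```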

### Fetch models and tokenizers to use offline

Another option for using 🤗 Transformers offline is to download the files ahead of time, and then point to their local path when you need to use them offline. There are three ways to do this:

* Download a file through the user interface on the [Model Hub](https://huggingface.co/models) by clicking on the ↓ icon.

    

* Use the [`PreTrainedModel.from_pretrained`] and [`PreTrainedModel.save_pretrained`] workflow:

    1. Download your files ahead of time with [`PreTrainedModel.from_pretrained`]:

    ```py
    >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B")
    >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B")
    ```

    2. Save your files to a specified directory with [`PreTrainedModel.save_pretrained`]:

    ```py
    >>> tokenizer.save_pretrained("./your/path/bigscience_t0")
    >>> model.save_pretrained("./your/path/bigscience_t0")
    ```

    3. Now when you're offline, reload your files with [`PreTrainedModel.from_pretrained`] from the specified directory:

    ```py
    >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0")
    >>> model = AutoModelForSeq2SeqLM.from_pretrained("./your/path/bigscience_t0")
    ```

* Programmatically download files with the [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) library:

    1. Install the `huggingface_hub` library in your virtual environment:

    ```bash
    python -m pip install huggingface_hub
    ```

    2. Use the [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) function to download a file to a specific path. For example, the following command downloads the `config.json` file from the [T0](https://huggingface.co/bigscience/T0_3B) model to your desired path:

    ```py
    >>> from huggingface_hub import hf_hub_download

    >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0")
    ```

Once your file is downloaded and locally cached, specify its local path to load and use it:

```py
>>> from transformers import AutoConfig

>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json")
```

<Tip>

See the [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream) section for more details on downloading files stored on the Hub.

</Tip>
46
docs/source/internal/file_utils.mdx
Normal file
@ -0,0 +1,46 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# General Utilities

This page lists all of Transformers general utility functions that are found in the file `file_utils.py`.

Most of those are only useful if you are studying the general code in the library.

## Enums and namedtuples

[[autodoc]] file_utils.ExplicitEnum

[[autodoc]] file_utils.PaddingStrategy

[[autodoc]] file_utils.TensorType
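
As a quick illustration (an aside, not part of the original reference page), these `ExplicitEnum` subclasses can be built directly from their string values:

```py
>>> from transformers.file_utils import PaddingStrategy, TensorType

>>> PaddingStrategy("max_length") is PaddingStrategy.MAX_LENGTH
True
>>> TensorType("pt") is TensorType.PYTORCH
True
```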

## Special Decorators

[[autodoc]] file_utils.add_start_docstrings

[[autodoc]] file_utils.add_start_docstrings_to_model_forward

[[autodoc]] file_utils.add_end_docstrings

[[autodoc]] file_utils.add_code_sample_docstrings

[[autodoc]] file_utils.replace_return_docstrings

## Special Properties

[[autodoc]] file_utils.cached_property

## Other Utilities

[[autodoc]] file_utils._LazyModule
@ -1,54 +0,0 @@
..
    Copyright 2021 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

General Utilities
-----------------------------------------------------------------------------------------------------------------------

This page lists all of Transformers general utility functions that are found in the file ``file_utils.py``.

Most of those are only useful if you are studying the general code in the library.


Enums and namedtuples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.file_utils.ExplicitEnum

.. autoclass:: transformers.file_utils.PaddingStrategy

.. autoclass:: transformers.file_utils.TensorType


Special Decorators
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: transformers.file_utils.add_start_docstrings

.. autofunction:: transformers.file_utils.add_start_docstrings_to_model_forward

.. autofunction:: transformers.file_utils.add_end_docstrings

.. autofunction:: transformers.file_utils.add_code_sample_docstrings

.. autofunction:: transformers.file_utils.replace_return_docstrings


Special Properties
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.file_utils.cached_property


Other Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.file_utils._LazyModule
254
docs/source/internal/generation_utils.mdx
Normal file
@ -0,0 +1,254 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Utilities for Generation

This page lists all the utility functions used by [`~generation_utils.GenerationMixin.generate`],
[`~generation_utils.GenerationMixin.greedy_search`],
[`~generation_utils.GenerationMixin.sample`],
[`~generation_utils.GenerationMixin.beam_search`],
[`~generation_utils.GenerationMixin.beam_sample`],
[`~generation_utils.GenerationMixin.group_beam_search`], and
[`~generation_utils.GenerationMixin.constrained_beam_search`].

Most of those are only useful if you are studying the code of the generate methods in the library.

## Generate Outputs

The output of [`~generation_utils.GenerationMixin.generate`] is an instance of a subclass of
[`~file_utils.ModelOutput`]. This output is a data structure containing all the information returned
by [`~generation_utils.GenerationMixin.generate`], but it can also be used as a tuple or a dictionary.

Here's an example:

```python
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
```

The `generation_output` object is a [`~generation_utils.GreedySearchDecoderOnlyOutput`]; as we can
see in the documentation of that class below, this means it has the following attributes:

- `sequences`: the generated sequences of tokens
- `scores` (optional): the prediction scores of the language modelling head, for each generation step
- `hidden_states` (optional): the hidden states of the model, for each generation step
- `attentions` (optional): the attention weights of the model, for each generation step

Here we have the `scores` since we passed along `output_scores=True`, but we don't have `hidden_states` and
`attentions` because we didn't pass `output_hidden_states=True` or `output_attentions=True`.

You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
will get `None`. Here for instance `generation_output.scores` are all the generated prediction scores of the
language modeling head, and `generation_output.attentions` is `None`.

When using our `generation_output` object as a tuple, it only keeps the attributes that don't have `None` values.
Here, for instance, it has two elements, `sequences` then `scores`, so

```python
generation_output[:2]
```

will return the tuple `(generation_output.sequences, generation_output.scores)`.

When using our `generation_output` object as a dictionary, it only keeps the attributes that don't have `None`
values. Here, for instance, it has two keys that are `sequences` and `scores`.
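
Concretely, the same values can therefore be reached in three ways (a small sketch continuing the example above):

```python
generation_output.sequences  # attribute access
generation_output["scores"]  # dictionary access; only non-None attributes are keys
sequences, scores = generation_output[:2]  # tuple access, in attribute order
```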

We document here all output types.


### GreedySearchOutput

[[autodoc]] generation_utils.GreedySearchDecoderOnlyOutput

[[autodoc]] generation_utils.GreedySearchEncoderDecoderOutput

[[autodoc]] generation_flax_utils.FlaxGreedySearchOutput

### SampleOutput

[[autodoc]] generation_utils.SampleDecoderOnlyOutput

[[autodoc]] generation_utils.SampleEncoderDecoderOutput

[[autodoc]] generation_flax_utils.FlaxSampleOutput

### BeamSearchOutput

[[autodoc]] generation_utils.BeamSearchDecoderOnlyOutput

[[autodoc]] generation_utils.BeamSearchEncoderDecoderOutput

### BeamSampleOutput

[[autodoc]] generation_utils.BeamSampleDecoderOnlyOutput

[[autodoc]] generation_utils.BeamSampleEncoderDecoderOutput

## LogitsProcessor

A [`LogitsProcessor`] can be used to modify the prediction scores of a language model head for
generation.
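
For instance, here is a minimal sketch (vocabulary size and token ids are illustrative) that bundles a [`MinLengthLogitsProcessor`] in a [`LogitsProcessorList`] and applies it to a batch of next-token scores:

```python
import torch

from transformers import LogitsProcessorList, MinLengthLogitsProcessor

# forbid the EOS token (id 50256 here, illustrative) until 10 tokens have been generated
processors = LogitsProcessorList([MinLengthLogitsProcessor(10, eos_token_id=50256)])

input_ids = torch.tensor([[0, 1, 2]])  # current sequence, batch of 1
scores = torch.randn(1, 50257)  # next-token logits from the model
scores = processors(input_ids, scores)  # the EOS logit is now -inf
```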

[[autodoc]] LogitsProcessor
    - __call__

[[autodoc]] LogitsProcessorList
    - __call__

[[autodoc]] LogitsWarper
    - __call__

[[autodoc]] MinLengthLogitsProcessor
    - __call__

[[autodoc]] TemperatureLogitsWarper
    - __call__

[[autodoc]] RepetitionPenaltyLogitsProcessor
    - __call__

[[autodoc]] TopPLogitsWarper
    - __call__

[[autodoc]] TopKLogitsWarper
    - __call__

[[autodoc]] NoRepeatNGramLogitsProcessor
    - __call__

[[autodoc]] NoBadWordsLogitsProcessor
    - __call__

[[autodoc]] PrefixConstrainedLogitsProcessor
    - __call__

[[autodoc]] HammingDiversityLogitsProcessor
    - __call__

[[autodoc]] ForcedBOSTokenLogitsProcessor
    - __call__

[[autodoc]] ForcedEOSTokenLogitsProcessor
    - __call__

[[autodoc]] InfNanRemoveLogitsProcessor
    - __call__

[[autodoc]] TFLogitsProcessor
    - __call__

[[autodoc]] TFLogitsProcessorList
    - __call__

[[autodoc]] TFLogitsWarper
    - __call__

[[autodoc]] TFTemperatureLogitsWarper
    - __call__

[[autodoc]] TFTopPLogitsWarper
    - __call__

[[autodoc]] TFTopKLogitsWarper
    - __call__

[[autodoc]] TFMinLengthLogitsProcessor
    - __call__

[[autodoc]] TFNoBadWordsLogitsProcessor
    - __call__

[[autodoc]] TFNoRepeatNGramLogitsProcessor
    - __call__

[[autodoc]] TFRepetitionPenaltyLogitsProcessor
    - __call__

[[autodoc]] FlaxLogitsProcessor
    - __call__

[[autodoc]] FlaxLogitsProcessorList
    - __call__

[[autodoc]] FlaxLogitsWarper
    - __call__

[[autodoc]] FlaxTemperatureLogitsWarper
    - __call__

[[autodoc]] FlaxTopPLogitsWarper
    - __call__

[[autodoc]] FlaxTopKLogitsWarper
    - __call__

[[autodoc]] FlaxForcedBOSTokenLogitsProcessor
    - __call__

[[autodoc]] FlaxForcedEOSTokenLogitsProcessor
    - __call__

[[autodoc]] FlaxMinLengthLogitsProcessor
    - __call__

## StoppingCriteria

A [`StoppingCriteria`] can be used to change when to stop generation (other than EOS token).
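
A minimal sketch of how a criteria list is evaluated (shapes and vocabulary size are illustrative):

```python
import torch

from transformers import MaxLengthCriteria, StoppingCriteriaList

criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])

input_ids = torch.zeros((1, 20), dtype=torch.long)  # pretend 20 tokens were generated
scores = torch.randn(1, 50257)
criteria(input_ids, scores)  # True: the length budget is spent, generation should stop
```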

[[autodoc]] StoppingCriteria
    - __call__

[[autodoc]] StoppingCriteriaList
    - __call__

[[autodoc]] MaxLengthCriteria
    - __call__

[[autodoc]] MaxTimeCriteria
    - __call__

## Constraints

A [`Constraint`] can be used to force the generation to include specific tokens or sequences in the output.
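
For example, a [`PhrasalConstraint`] forces a given token sequence to appear in the output. A hedged sketch (checkpoint, phrase, and beam count are illustrative):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, PhrasalConstraint

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# token ids of the phrase that must appear in the generated text
force_ids = tokenizer("Sie", add_special_tokens=False).input_ids

inputs = tokenizer("translate English to German: How old are you?", return_tensors="pt")
outputs = model.generate(**inputs, constraints=[PhrasalConstraint(force_ids)], num_beams=4)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```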

[[autodoc]] Constraint

[[autodoc]] PhrasalConstraint

[[autodoc]] DisjunctiveConstraint

[[autodoc]] ConstraintListState

## BeamSearch

[[autodoc]] BeamScorer
    - process
    - finalize

[[autodoc]] BeamSearchScorer
    - process
    - finalize

[[autodoc]] ConstrainedBeamSearchScorer
    - process
    - finalize

## Utilities

[[autodoc]] top_k_top_p_filtering

[[autodoc]] tf_top_k_top_p_filtering
@ -1,230 +0,0 @@
..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

Utilities for Generation
-----------------------------------------------------------------------------------------------------------------------

This page lists all the utility functions used by :meth:`~transformers.generation_utils.GenerationMixin.generate`,
:meth:`~transformers.generation_utils.GenerationMixin.greedy_search`,
:meth:`~transformers.generation_utils.GenerationMixin.sample`,
:meth:`~transformers.generation_utils.GenerationMixin.beam_search`,
:meth:`~transformers.generation_utils.GenerationMixin.beam_sample`, and
:meth:`~transformers.generation_utils.GenerationMixin.group_beam_search`.

Most of those are only useful if you are studying the code of the generate methods in the library.

Generate Outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The output of :meth:`~transformers.generation_utils.GenerationMixin.generate` is an instance of a subclass of
:class:`~transformers.file_utils.ModelOutput`. This output is a data structure containing all the information returned
by :meth:`~transformers.generation_utils.GenerationMixin.generate`, but it can also be used as a tuple or a dictionary.

Here's an example:

.. code-block::

    from transformers import GPT2Tokenizer, GPT2LMHeadModel

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
    generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)

The ``generation_output`` object is a :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`; as we can
see in the documentation of that class below, this means it has the following attributes:

- ``sequences``: the generated sequences of tokens
- ``scores`` (optional): the prediction scores of the language modelling head, for each generation step
- ``hidden_states`` (optional): the hidden states of the model, for each generation step
- ``attentions`` (optional): the attention weights of the model, for each generation step

Here we have the ``scores`` since we passed along ``output_scores=True``, but we don't have ``hidden_states`` and
``attentions`` because we didn't pass ``output_hidden_states=True`` or ``output_attentions=True``.

You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
will get ``None``. Here for instance ``generation_output.scores`` are all the generated prediction scores of the
language modeling head, and ``generation_output.attentions`` is ``None``.

When using our ``generation_output`` object as a tuple, it only keeps the attributes that don't have ``None`` values.
Here, for instance, it has two elements, ``sequences`` then ``scores``, so

.. code-block::

    generation_output[:2]

will return the tuple ``(generation_output.sequences, generation_output.scores)``.

When using our ``generation_output`` object as a dictionary, it only keeps the attributes that don't have ``None``
values. Here, for instance, it has two keys that are ``sequences`` and ``scores``.

We document here all output types.


GreedySearchOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: transformers.generation_utils.GreedySearchDecoderOnlyOutput
    :members:

.. autoclass:: transformers.generation_utils.GreedySearchEncoderDecoderOutput
    :members:

.. autoclass:: transformers.generation_flax_utils.FlaxGreedySearchOutput
    :members:


SampleOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: transformers.generation_utils.SampleDecoderOnlyOutput
    :members:

.. autoclass:: transformers.generation_utils.SampleEncoderDecoderOutput
    :members:

.. autoclass:: transformers.generation_flax_utils.FlaxSampleOutput
    :members:


BeamSearchOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: transformers.generation_utils.BeamSearchDecoderOnlyOutput
    :members:

.. autoclass:: transformers.generation_utils.BeamSearchEncoderDecoderOutput
    :members:


BeamSampleOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: transformers.generation_utils.BeamSampleDecoderOnlyOutput
    :members:

.. autoclass:: transformers.generation_utils.BeamSampleEncoderDecoderOutput
    :members:


LogitsProcessor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

A :class:`~transformers.LogitsProcessor` can be used to modify the prediction scores of a language model head for
generation.

.. autoclass:: transformers.LogitsProcessor
    :members: __call__

.. autoclass:: transformers.LogitsProcessorList
    :members: __call__

.. autoclass:: transformers.LogitsWarper
    :members: __call__

.. autoclass:: transformers.MinLengthLogitsProcessor
    :members: __call__

.. autoclass:: transformers.TemperatureLogitsWarper
    :members: __call__

.. autoclass:: transformers.RepetitionPenaltyLogitsProcessor
    :members: __call__

.. autoclass:: transformers.TopPLogitsWarper
    :members: __call__

.. autoclass:: transformers.TopKLogitsWarper
    :members: __call__

.. autoclass:: transformers.NoRepeatNGramLogitsProcessor
    :members: __call__

.. autoclass:: transformers.NoBadWordsLogitsProcessor
    :members: __call__

.. autoclass:: transformers.PrefixConstrainedLogitsProcessor
    :members: __call__

.. autoclass:: transformers.HammingDiversityLogitsProcessor
    :members: __call__

.. autoclass:: transformers.ForcedBOSTokenLogitsProcessor
    :members: __call__

.. autoclass:: transformers.ForcedEOSTokenLogitsProcessor
    :members: __call__

.. autoclass:: transformers.InfNanRemoveLogitsProcessor
    :members: __call__

.. autoclass:: transformers.FlaxLogitsProcessor
    :members: __call__

.. autoclass:: transformers.FlaxLogitsProcessorList
    :members: __call__

.. autoclass:: transformers.FlaxLogitsWarper
    :members: __call__

.. autoclass:: transformers.FlaxTemperatureLogitsWarper
    :members: __call__

.. autoclass:: transformers.FlaxTopPLogitsWarper
    :members: __call__

.. autoclass:: transformers.FlaxTopKLogitsWarper
    :members: __call__

.. autoclass:: transformers.FlaxForcedBOSTokenLogitsProcessor
    :members: __call__

.. autoclass:: transformers.FlaxForcedEOSTokenLogitsProcessor
    :members: __call__

.. autoclass:: transformers.FlaxMinLengthLogitsProcessor
    :members: __call__


StoppingCriteria
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

A :class:`~transformers.StoppingCriteria` can be used to change when to stop generation (other than EOS token).

.. autoclass:: transformers.StoppingCriteria
    :members: __call__

.. autoclass:: transformers.StoppingCriteriaList
    :members: __call__

.. autoclass:: transformers.MaxLengthCriteria
    :members: __call__

.. autoclass:: transformers.MaxTimeCriteria
    :members: __call__

BeamSearch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.BeamScorer
    :members: process, finalize

.. autoclass:: transformers.BeamSearchScorer
    :members: process, finalize

Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: transformers.top_k_top_p_filtering

.. autofunction:: transformers.tf_top_k_top_p_filtering
82
docs/source/internal/modeling_utils.mdx
Normal file
@ -0,0 +1,82 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Custom Layers and Utilities

This page lists all the custom layers used by the library, as well as the utility functions it provides for modeling.

Most of those are only useful if you are studying the code of the models in the library.


## Pytorch custom modules

[[autodoc]] modeling_utils.Conv1D

[[autodoc]] modeling_utils.PoolerStartLogits
    - forward

[[autodoc]] modeling_utils.PoolerEndLogits
    - forward

[[autodoc]] modeling_utils.PoolerAnswerClass
    - forward

[[autodoc]] modeling_utils.SquadHeadOutput

[[autodoc]] modeling_utils.SQuADHead
    - forward

[[autodoc]] modeling_utils.SequenceSummary
    - forward

## PyTorch Helper Functions

[[autodoc]] apply_chunking_to_forward
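
As a hedged aside (not part of the original reference page), [`apply_chunking_to_forward`] splits its inputs along one dimension, applies a forward function chunk by chunk, and concatenates the results, trading compute for peak memory:

```python
import torch

from transformers import apply_chunking_to_forward

def feed_forward(hidden_states):  # the function applied to each chunk
    return hidden_states * 2

hidden_states = torch.randn(2, 16, 8)  # (batch, seq_len, hidden), illustrative
# run feed_forward in chunks of 4 along the sequence dimension (dim 1)
output = apply_chunking_to_forward(feed_forward, 4, 1, hidden_states)
assert output.shape == hidden_states.shape
```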

[[autodoc]] modeling_utils.find_pruneable_heads_and_indices

[[autodoc]] modeling_utils.prune_layer

[[autodoc]] modeling_utils.prune_conv1d_layer

[[autodoc]] modeling_utils.prune_linear_layer

## TensorFlow custom layers

[[autodoc]] modeling_tf_utils.TFConv1D

[[autodoc]] modeling_tf_utils.TFSharedEmbeddings
    - call

[[autodoc]] modeling_tf_utils.TFSequenceSummary

## TensorFlow loss functions

[[autodoc]] modeling_tf_utils.TFCausalLanguageModelingLoss

[[autodoc]] modeling_tf_utils.TFMaskedLanguageModelingLoss

[[autodoc]] modeling_tf_utils.TFMultipleChoiceLoss

[[autodoc]] modeling_tf_utils.TFQuestionAnsweringLoss

[[autodoc]] modeling_tf_utils.TFSequenceClassificationLoss

[[autodoc]] modeling_tf_utils.TFTokenClassificationLoss

## TensorFlow Helper Functions

[[autodoc]] modeling_tf_utils.get_initializer

[[autodoc]] modeling_tf_utils.keras_serializable

[[autodoc]] modeling_tf_utils.shape_list
@ -1,97 +0,0 @@
..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

Custom Layers and Utilities
-----------------------------------------------------------------------------------------------------------------------

This page lists all the custom layers used by the library, as well as the utility functions it provides for modeling.

Most of those are only useful if you are studying the code of the models in the library.


Pytorch custom modules
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.modeling_utils.Conv1D

.. autoclass:: transformers.modeling_utils.PoolerStartLogits
    :members: forward

.. autoclass:: transformers.modeling_utils.PoolerEndLogits
    :members: forward

.. autoclass:: transformers.modeling_utils.PoolerAnswerClass
    :members: forward

.. autoclass:: transformers.modeling_utils.SquadHeadOutput

.. autoclass:: transformers.modeling_utils.SQuADHead
    :members: forward

.. autoclass:: transformers.modeling_utils.SequenceSummary
    :members: forward


PyTorch Helper Functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: transformers.apply_chunking_to_forward

.. autofunction:: transformers.modeling_utils.find_pruneable_heads_and_indices

.. autofunction:: transformers.modeling_utils.prune_layer

.. autofunction:: transformers.modeling_utils.prune_conv1d_layer

.. autofunction:: transformers.modeling_utils.prune_linear_layer

TensorFlow custom layers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.modeling_tf_utils.TFConv1D

.. autoclass:: transformers.modeling_tf_utils.TFSharedEmbeddings
    :members: call

.. autoclass:: transformers.modeling_tf_utils.TFSequenceSummary


TensorFlow loss functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.modeling_tf_utils.TFCausalLanguageModelingLoss
    :members:

.. autoclass:: transformers.modeling_tf_utils.TFMaskedLanguageModelingLoss
    :members:

.. autoclass:: transformers.modeling_tf_utils.TFMultipleChoiceLoss
    :members:

.. autoclass:: transformers.modeling_tf_utils.TFQuestionAnsweringLoss
    :members:

.. autoclass:: transformers.modeling_tf_utils.TFSequenceClassificationLoss
    :members:

.. autoclass:: transformers.modeling_tf_utils.TFTokenClassificationLoss
    :members:


TensorFlow Helper Functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: transformers.modeling_tf_utils.get_initializer

.. autofunction:: transformers.modeling_tf_utils.keras_serializable

.. autofunction:: transformers.modeling_tf_utils.shape_list
40
docs/source/internal/pipelines_utils.mdx
Normal file
@ -0,0 +1,40 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Utilities for pipelines

This page lists all the utility functions the library provides for pipelines.

Most of those are only useful if you are studying the code of the pipelines in the library.

## Argument handling

[[autodoc]] pipelines.ArgumentHandler

[[autodoc]] pipelines.ZeroShotClassificationArgumentHandler

[[autodoc]] pipelines.QuestionAnsweringArgumentHandler

## Data format

[[autodoc]] pipelines.PipelineDataFormat

[[autodoc]] pipelines.CsvPipelineDataFormat

[[autodoc]] pipelines.JsonPipelineDataFormat

[[autodoc]] pipelines.PipedPipelineDataFormat
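These format classes are easiest to see through `PipelineDataFormat.from_str`, which picks the right subclass from a
short format string. A minimal sketch (the file names are illustrative, and this is an orientation example rather than
a verbatim reproduction of the signature):

```python
from transformers.pipelines import PipelineDataFormat

# "csv" selects CsvPipelineDataFormat; "json" and "pipe" behave analogously.
data_format = PipelineDataFormat.from_str(
    "csv", output_path="predictions.csv", input_path="inputs.csv", column="text", overwrite=True
)
for item in data_format:  # iterates over the rows of inputs.csv
    print(item)
```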
## Utilities

[[autodoc]] pipelines.PipelineException
@ -1,50 +0,0 @@
..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

Utilities for pipelines
-----------------------------------------------------------------------------------------------------------------------

This page lists all the utility functions the library provides for pipelines.

Most of those are only useful if you are studying the code of the models in the library.

Argument handling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.pipelines.ArgumentHandler

.. autoclass:: transformers.pipelines.ZeroShotClassificationArgumentHandler

.. autoclass:: transformers.pipelines.QuestionAnsweringArgumentHandler


Data format
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.pipelines.PipelineDataFormat
    :members:

.. autoclass:: transformers.pipelines.CsvPipelineDataFormat
    :members:

.. autoclass:: transformers.pipelines.JsonPipelineDataFormat
    :members:

.. autoclass:: transformers.pipelines.PipedPipelineDataFormat
    :members:


Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.pipelines.PipelineException
38
docs/source/internal/tokenization_utils.mdx
Normal file
@ -0,0 +1,38 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Utilities for Tokenizers

This page lists all the utility functions used by the tokenizers, mainly the class
[`~tokenization_utils_base.PreTrainedTokenizerBase`] that implements the common methods between
[`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] and the mixin
[`~tokenization_utils_base.SpecialTokensMixin`].

Most of those are only useful if you are studying the code of the tokenizers in the library.

## PreTrainedTokenizerBase

[[autodoc]] tokenization_utils_base.PreTrainedTokenizerBase
    - __call__
    - all
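For orientation, a minimal sketch of the `__call__` entry point documented above (the checkpoint name is illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Calling the tokenizer pads/truncates and returns a BatchEncoding of input ids and masks.
batch = tokenizer(["Hello world!", "Utility classes at work."], padding=True, truncation=True)
print(batch["input_ids"])
```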
## SpecialTokensMixin

[[autodoc]] tokenization_utils_base.SpecialTokensMixin

## Enums and namedtuples

[[autodoc]] tokenization_utils_base.TruncationStrategy

[[autodoc]] tokenization_utils_base.CharSpan

[[autodoc]] tokenization_utils_base.TokenSpan
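These namedtuples mostly appear as return values of the alignment methods of fast tokenizers; a minimal sketch
(the checkpoint is illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer("Utilities for tokenizers")
# token_to_chars returns a CharSpan namedtuple with .start and .end character offsets.
span = encoding.token_to_chars(1)  # index 0 is the [CLS] special token
print(span.start, span.end)
```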
@ -1,45 +0,0 @@
..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

Utilities for Tokenizers
-----------------------------------------------------------------------------------------------------------------------

This page lists all the utility functions used by the tokenizers, mainly the class
:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that implements the common methods between
:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` and the mixin
:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.

Most of those are only useful if you are studying the code of the tokenizers in the library.

PreTrainedTokenizerBase
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.tokenization_utils_base.PreTrainedTokenizerBase
    :special-members: __call__
    :members:


SpecialTokensMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.tokenization_utils_base.SpecialTokensMixin
    :members:


Enums and namedtuples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy

.. autoclass:: transformers.tokenization_utils_base.CharSpan

.. autoclass:: transformers.tokenization_utils_base.TokenSpan
43
docs/source/internal/trainer_utils.mdx
Normal file
@ -0,0 +1,43 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Utilities for Trainer

This page lists all the utility functions used by [`Trainer`].

Most of those are only useful if you are studying the code of the Trainer in the library.

## Utilities

[[autodoc]] EvalPrediction

[[autodoc]] IntervalStrategy

[[autodoc]] set_seed

[[autodoc]] torch_distributed_zero_first
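As a minimal sketch of two of these utilities in context (the metric function is illustrative): `set_seed` fixes the
random seeds, and [`EvalPrediction`] is the object a `Trainer` `compute_metrics` function receives.

```python
import numpy as np

from transformers import EvalPrediction, set_seed

set_seed(42)  # seeds the python, numpy and framework RNGs for reproducibility

def compute_accuracy(eval_pred: EvalPrediction) -> dict:
    # eval_pred bundles model predictions with the reference label ids.
    predictions = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predictions == eval_pred.label_ids).mean())}
```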
## Callbacks internals

[[autodoc]] trainer_callback.CallbackHandler

## Distributed Evaluation

[[autodoc]] trainer_pt_utils.DistributedTensorGatherer

## Argument Parsing

[[autodoc]] HfArgumentParser
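Since [`HfArgumentParser`] is easiest to grasp from a concrete call, here is a minimal sketch (the dataclass and its
field are illustrative, not part of the library):

```python
from dataclasses import dataclass, field

from transformers import HfArgumentParser, TrainingArguments

@dataclass
class ModelArguments:
    # Illustrative user-defined argument; every dataclass field becomes a CLI flag.
    model_name_or_path: str = field(default="bert-base-uncased")

parser = HfArgumentParser((ModelArguments, TrainingArguments))
# Passing args explicitly keeps the sketch runnable outside a CLI context.
model_args, training_args = parser.parse_args_into_dataclasses(args=["--output_dir", "out"])
```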
## Debug Utilities

[[autodoc]] debug_utils.DebugUnderflowOverflow
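The debug class is typically attached right after model creation; a hedged sketch (the checkpoint is illustrative):

```python
from transformers import AutoModel
from transformers.debug_utils import DebugUnderflowOverflow

model = AutoModel.from_pretrained("bert-base-uncased")
# Registers forward hooks that report inf/nan activations during training.
debug_overflow = DebugUnderflowOverflow(model)
```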
@ -1,54 +0,0 @@
..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

Utilities for Trainer
-----------------------------------------------------------------------------------------------------------------------

This page lists all the utility functions used by :class:`~transformers.Trainer`.

Most of those are only useful if you are studying the code of the Trainer in the library.

Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.EvalPrediction

.. autoclass:: transformers.IntervalStrategy

.. autofunction:: transformers.set_seed

.. autofunction:: transformers.torch_distributed_zero_first


Callbacks internals
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.trainer_callback.CallbackHandler


Distributed Evaluation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.trainer_pt_utils.DistributedTensorGatherer
    :members:


Distributed Evaluation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.HfArgumentParser


Debug Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.debug_utils.DebugUnderflowOverflow
111
docs/source/main_classes/callback.mdx
Normal file
@ -0,0 +1,111 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Callbacks

Callbacks are objects that can customize the behavior of the training loop in the PyTorch
[`Trainer`] (this feature is not yet implemented in TensorFlow). They can inspect the training loop
state (for progress reporting, logging on TensorBoard or other ML platforms...) and make decisions (like early
stopping).

Callbacks are "read only" pieces of code: apart from the [`TrainerControl`] object they return, they
cannot change anything in the training loop. For customizations that require changes in the training loop, you should
subclass [`Trainer`] and override the methods you need (see [trainer](trainer) for examples).

By default a [`Trainer`] will use the following callbacks:

- [`DefaultFlowCallback`] which handles the default behavior for logging, saving and evaluation.
- [`PrinterCallback`] or [`ProgressCallback`] to display progress and print the
  logs (the first one is used if you deactivate tqdm through the [`TrainingArguments`], otherwise
  it's the second one).
- [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
  or tensorboardX).
- [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.ml/site/) is installed.
- [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
- [`~integrations.AzureMLCallback`] if [azureml-sdk](https://pypi.org/project/azureml-sdk/) is
  installed.
- [`~integrations.CodeCarbonCallback`] if [codecarbon](https://pypi.org/project/codecarbon/) is
  installed.

The main class that implements callbacks is [`TrainerCallback`]. It gets the
[`TrainingArguments`] used to instantiate the [`Trainer`], can access that
Trainer's internal state via [`TrainerState`], and can take some actions on the training loop via
[`TrainerControl`].

## Available Callbacks

Here is the list of the available [`TrainerCallback`] classes in the library:

[[autodoc]] integrations.CometCallback
    - setup

[[autodoc]] DefaultFlowCallback

[[autodoc]] PrinterCallback

[[autodoc]] ProgressCallback

[[autodoc]] EarlyStoppingCallback

[[autodoc]] integrations.TensorBoardCallback

[[autodoc]] integrations.WandbCallback
    - setup

[[autodoc]] integrations.MLflowCallback
    - setup

[[autodoc]] integrations.AzureMLCallback

[[autodoc]] integrations.CodeCarbonCallback
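Most of these callbacks are enabled automatically; others, like [`EarlyStoppingCallback`], are opted into explicitly.
A minimal sketch of the arguments it expects (the values are illustrative):

```python
from transformers import EarlyStoppingCallback, TrainingArguments

# EarlyStoppingCallback requires regular evaluation and best-model tracking.
args = TrainingArguments(
    output_dir="out",
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
# Passed to the Trainer via `callbacks=[early_stopping]`, as in the example below.
```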
## TrainerCallback

[[autodoc]] TrainerCallback

Here is an example of how to register a custom callback with the PyTorch [`Trainer`]:

```python
class MyCallback(TrainerCallback):
    "A callback that prints a message at the beginning of training"

    def on_train_begin(self, args, state, control, **kwargs):
        print("Starting training")


trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[MyCallback],  # We can either pass the callback class this way or an instance of it (MyCallback())
)
```

Another way to register a callback is to call `trainer.add_callback()` as follows:

```python
trainer = Trainer(...)
trainer.add_callback(MyCallback)
# Alternatively, we can pass an instance of the callback class
trainer.add_callback(MyCallback())
```

## TrainerState

[[autodoc]] TrainerState

## TrainerControl

[[autodoc]] TrainerControl
@ -1,115 +0,0 @@
..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

Callbacks
-----------------------------------------------------------------------------------------------------------------------

Callbacks are objects that can customize the behavior of the training loop in the PyTorch
:class:`~transformers.Trainer` (this feature is not yet implemented in TensorFlow) that can inspect the training loop
state (for progress reporting, logging on TensorBoard or other ML platforms...) and take decisions (like early
stopping).

Callbacks are "read only" pieces of code, apart from the :class:`~transformers.TrainerControl` object they return, they
cannot change anything in the training loop. For customizations that require changes in the training loop, you should
subclass :class:`~transformers.Trainer` and override the methods you need (see :doc:`trainer` for examples).

By default a :class:`~transformers.Trainer` will use the following callbacks:

- :class:`~transformers.DefaultFlowCallback` which handles the default behavior for logging, saving and evaluation.
- :class:`~transformers.PrinterCallback` or :class:`~transformers.ProgressCallback` to display progress and print the
  logs (the first one is used if you deactivate tqdm through the :class:`~transformers.TrainingArguments`, otherwise
  it's the second one).
- :class:`~transformers.integrations.TensorBoardCallback` if tensorboard is accessible (either through PyTorch >= 1.4
  or tensorboardX).
- :class:`~transformers.integrations.WandbCallback` if `wandb <https://www.wandb.com/>`__ is installed.
- :class:`~transformers.integrations.CometCallback` if `comet_ml <https://www.comet.ml/site/>`__ is installed.
- :class:`~transformers.integrations.MLflowCallback` if `mlflow <https://www.mlflow.org/>`__ is installed.
- :class:`~transformers.integrations.AzureMLCallback` if `azureml-sdk <https://pypi.org/project/azureml-sdk/>`__ is
  installed.

The main class that implements callbacks is :class:`~transformers.TrainerCallback`. It gets the
:class:`~transformers.TrainingArguments` used to instantiate the :class:`~transformers.Trainer`, can access that
Trainer's internal state via :class:`~transformers.TrainerState`, and can take some actions on the training loop via
:class:`~transformers.TrainerControl`.


Available Callbacks
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Here is the list of the available :class:`~transformers.TrainerCallback` in the library:

.. autoclass:: transformers.integrations.CometCallback
    :members: setup

.. autoclass:: transformers.DefaultFlowCallback

.. autoclass:: transformers.PrinterCallback

.. autoclass:: transformers.ProgressCallback

.. autoclass:: transformers.EarlyStoppingCallback

.. autoclass:: transformers.integrations.TensorBoardCallback

.. autoclass:: transformers.integrations.WandbCallback
    :members: setup

.. autoclass:: transformers.integrations.MLflowCallback
    :members: setup

.. autoclass:: transformers.integrations.AzureMLCallback


TrainerCallback
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.TrainerCallback
    :members:

Here is an example of how to register a custom callback with the PyTorch :class:`~transformers.Trainer`:

.. code-block:: python

    class MyCallback(TrainerCallback):
        "A callback that prints a message at the beginning of training"

        def on_train_begin(self, args, state, control, **kwargs):
            print("Starting training")


    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[MyCallback]  # We can either pass the callback class this way or an instance of it (MyCallback())
    )

Another way to register a callback is to call ``trainer.add_callback()`` as follows:

.. code-block:: python

    trainer = Trainer(...)
    trainer.add_callback(MyCallback)
    # Alternatively, we can pass an instance of the callback class
    trainer.add_callback(MyCallback())


TrainerState
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.TrainerState
    :members:


TrainerControl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.TrainerControl
    :members:
28
docs/source/main_classes/configuration.mdx
Normal file
@ -0,0 +1,28 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Configuration

The base class [`PretrainedConfig`] implements the common methods for loading/saving a configuration
either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
from HuggingFace's AWS S3 repository).

Each derived config class implements model-specific attributes. Common attributes present in all config classes are:
`hidden_size`, `num_attention_heads`, and `num_hidden_layers`. Text models further implement
`vocab_size`.
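A minimal sketch of the round trip these methods implement (the local directory name is illustrative):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("bert-base-uncased")  # download, or load from cache
print(config.num_hidden_layers)  # one of the common attributes listed above
config.save_pretrained("./my-bert-config")  # writes config.json into the directory
reloaded = AutoConfig.from_pretrained("./my-bert-config")
```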
## PretrainedConfig

[[autodoc]] PretrainedConfig
    - push_to_hub
    - all
@ -1,31 +0,0 @@
..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

Configuration
-----------------------------------------------------------------------------------------------------------------------

The base class :class:`~transformers.PretrainedConfig` implements the common methods for loading/saving a configuration
either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
from HuggingFace's AWS S3 repository).

Each derived config class implements model specific attributes. Common attributes present in all config classes are:
:obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further implement:
:obj:`vocab_size`.


PretrainedConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.PretrainedConfig
    :special-members: push_to_hub
    :members:
64
docs/source/main_classes/data_collator.mdx
Normal file
@ -0,0 +1,64 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Data Collator

Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of
the same type as the elements of `train_dataset` or `eval_dataset`.

To be able to build batches, data collators may apply some processing (like padding). Some of them (like
[`DataCollatorForLanguageModeling`]) also apply some random data augmentation (like random masking)
on the formed batch.

Examples of use can be found in the [example scripts](../examples) or [example notebooks](../notebooks).
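A minimal sketch of a collator at work, with an illustrative checkpoint: [`DataCollatorWithPadding`] pads a list of
tokenized examples into a rectangular batch of tensors.

```python
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

features = [tokenizer("short text"), tokenizer("a somewhat longer example text")]
batch = collator(features)  # input_ids / attention_mask padded to the same length
```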
## Default data collator

[[autodoc]] data.data_collator.default_data_collator

## DefaultDataCollator

[[autodoc]] data.data_collator.DefaultDataCollator

## DataCollatorWithPadding

[[autodoc]] data.data_collator.DataCollatorWithPadding

## DataCollatorForTokenClassification

[[autodoc]] data.data_collator.DataCollatorForTokenClassification

## DataCollatorForSeq2Seq

[[autodoc]] data.data_collator.DataCollatorForSeq2Seq

## DataCollatorForLanguageModeling

[[autodoc]] data.data_collator.DataCollatorForLanguageModeling
    - numpy_mask_tokens
    - tf_mask_tokens
    - torch_mask_tokens
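For instance, a hedged sketch of instantiating this collator (the checkpoint and masking probability are illustrative):

```python
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Randomly masks 15% of the tokens each time a batch is formed.
mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
```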
## DataCollatorForWholeWordMask

[[autodoc]] data.data_collator.DataCollatorForWholeWordMask
    - numpy_mask_tokens
    - tf_mask_tokens
    - torch_mask_tokens

## DataCollatorForPermutationLanguageModeling

[[autodoc]] data.data_collator.DataCollatorForPermutationLanguageModeling
    - numpy_mask_tokens
    - tf_mask_tokens
    - torch_mask_tokens
@ -1,78 +0,0 @@
..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

Data Collator
-----------------------------------------------------------------------------------------------------------------------

Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of
the same type as the elements of :obj:`train_dataset` or :obj:`eval_dataset`.

To be able to build batches, data collators may apply some processing (like padding). Some of them (like
:class:`~transformers.DataCollatorForLanguageModeling`) also apply some random data augmentation (like random masking)
on the formed batch.

Examples of use can be found in the :doc:`example scripts <../examples>` or :doc:`example notebooks <../notebooks>`.


Default data collator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: transformers.data.data_collator.default_data_collator


DefaultDataCollator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.data.data_collator.DefaultDataCollator
    :members:


DataCollatorWithPadding
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.data.data_collator.DataCollatorWithPadding
    :members:


DataCollatorForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.data.data_collator.DataCollatorForTokenClassification
    :members:


DataCollatorForSeq2Seq
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.data.data_collator.DataCollatorForSeq2Seq
    :members:


DataCollatorForLanguageModeling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling
    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens


DataCollatorForWholeWordMask
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask
    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens


DataCollatorForPermutationLanguageModeling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling
    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens
2039
docs/source/main_classes/deepspeed.mdx
Normal file
38
docs/source/main_classes/feature_extractor.mdx
Normal file
@ -0,0 +1,38 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Feature Extractor

A feature extractor is in charge of preparing input features for a multi-modal model. This includes feature extraction
from sequences, *e.g.*, pre-processing audio files to Log-Mel Spectrogram features, and feature extraction from images,
*e.g.*, cropping image files, but also padding, normalization, and conversion to NumPy, PyTorch, and TensorFlow
tensors.
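A minimal sketch of the shared `from_pretrained`/`save_pretrained` API on a concrete extractor (the checkpoint and
directory names are illustrative):

```python
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor.save_pretrained("./my-feature-extractor")  # writes preprocessor_config.json
```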
## FeatureExtractionMixin

[[autodoc]] feature_extraction_utils.FeatureExtractionMixin
    - from_pretrained
    - save_pretrained

## SequenceFeatureExtractor

[[autodoc]] SequenceFeatureExtractor
    - pad

## BatchFeature

[[autodoc]] BatchFeature

## ImageFeatureExtractionMixin

[[autodoc]] image_utils.ImageFeatureExtractionMixin
@ -1,48 +0,0 @@
..
    Copyright 2021 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

Feature Extractor
-----------------------------------------------------------------------------------------------------------------------

A feature extractor is in charge of preparing input features for a multi-modal model. This includes feature extraction
from sequences, *e.g.*, pre-processing audio files to Log-Mel Spectrogram features, feature extraction from images
*e.g.* cropping image image files, but also padding, normalization, and conversion to Numpy, PyTorch, and TensorFlow
tensors.


FeatureExtractionMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.feature_extraction_utils.FeatureExtractionMixin
    :members: from_pretrained, save_pretrained


SequenceFeatureExtractor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.SequenceFeatureExtractor
    :members: pad


BatchFeature
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.BatchFeature
    :members:


ImageFeatureExtractionMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.image_utils.ImageFeatureExtractionMixin
    :members:
24
docs/source/main_classes/keras_callbacks.mdx
Normal file
@ -0,0 +1,24 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Keras callbacks

When training a Transformers model with Keras, there are some library-specific callbacks available to automate common
tasks:

## KerasMetricCallback

[[autodoc]] KerasMetricCallback

## PushToHubCallback

[[autodoc]] PushToHubCallback
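A hedged sketch of wiring up [`KerasMetricCallback`] (the metric function and the tiny stand-in dataset are
illustrative; in practice the eval set is your tokenized `tf.data.Dataset`):

```python
import numpy as np
import tensorflow as tf

from transformers.keras_callbacks import KerasMetricCallback

def metric_fn(eval_predictions):
    # Receives (predictions, labels) gathered over the whole eval set.
    predictions, labels = eval_predictions
    return {"accuracy": float((np.argmax(predictions, axis=-1) == labels).mean())}

# Stand-in eval data, just to make the sketch self-contained.
eval_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": [[101, 2000, 102]]}, [0])).batch(1)
metric_callback = KerasMetricCallback(metric_fn=metric_fn, eval_dataset=eval_dataset)
# Then: model.fit(tf_train_dataset, callbacks=[metric_callback])
```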