mirror of https://github.com/huggingface/transformers.git (synced 2025-10-21 17:48:57 +08:00)

Compare commits: check_torc ... v4.0.0 (21 commits)
Commit SHA1s:
c781171dfa
ab597c84d1
e72b4fafeb
dc0dea3e42
4d8f5d12b3
710b0108c9
87199dee00
68879472c4
8c5a2b8e36
911d8486e8
563efd36ab
5a63232a8a
e46890f699
df2cdd84f3
c6e2876cd4
5580cccd81
ccc4f64044
3408e6ffcd
a986b02e49
b6ec39e41f
f80ea27f80
@@ -77,7 +77,7 @@ jobs:
 - v0.4-torch_and_tf-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
 - run: pip install --upgrade pip
-- run: pip install .[sklearn,tf-cpu,torch,testing]
+- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
 - save_cache:
 key: v0.4-{{ checksum "setup.py" }}
 paths:
@@ -103,7 +103,7 @@ jobs:
 - v0.4-torch-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
 - run: pip install --upgrade pip
-- run: pip install .[sklearn,torch,testing]
+- run: pip install .[sklearn,torch,testing,sentencepiece]
 - save_cache:
 key: v0.4-torch-{{ checksum "setup.py" }}
 paths:
@@ -129,7 +129,7 @@ jobs:
 - v0.4-tf-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
 - run: pip install --upgrade pip
-- run: pip install .[sklearn,tf-cpu,testing]
+- run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
 - save_cache:
 key: v0.4-tf-{{ checksum "setup.py" }}
 paths:
@@ -155,7 +155,7 @@ jobs:
 - v0.4-flax-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
 - run: pip install --upgrade pip
-- run: sudo pip install .[flax,sklearn,torch,testing]
+- run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece]
 - save_cache:
 key: v0.4-flax-{{ checksum "setup.py" }}
 paths:
@@ -181,7 +181,7 @@ jobs:
 - v0.4-torch-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
 - run: pip install --upgrade pip
-- run: pip install .[sklearn,torch,testing]
+- run: pip install .[sklearn,torch,testing,sentencepiece]
 - save_cache:
 key: v0.4-torch-{{ checksum "setup.py" }}
 paths:
@@ -207,7 +207,7 @@ jobs:
 - v0.4-tf-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
 - run: pip install --upgrade pip
-- run: pip install .[sklearn,tf-cpu,testing]
+- run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
 - save_cache:
 key: v0.4-tf-{{ checksum "setup.py" }}
 paths:
@@ -231,7 +231,7 @@ jobs:
 - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
 - run: pip install --upgrade pip
-- run: pip install .[ja,testing]
+- run: pip install .[ja,testing,sentencepiece]
 - run: python -m unidic download
 - save_cache:
 key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
@@ -258,7 +258,7 @@ jobs:
 - v0.4-torch_examples-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
 - run: pip install --upgrade pip
-- run: pip install .[sklearn,torch,testing]
+- run: pip install .[sklearn,torch,sentencepiece,testing]
 - run: pip install -r examples/requirements.txt
 - save_cache:
 key: v0.4-torch_examples-{{ checksum "setup.py" }}
@@ -324,7 +324,7 @@ jobs:
 - v0.4-{{ checksum "setup.py" }}
 - run: pip install --upgrade pip
 - run: pip install isort
-- run: pip install .[tf,torch,flax,quality]
+- run: pip install .[all,quality]
 - save_cache:
 key: v0.4-code_quality-{{ checksum "setup.py" }}
 paths:
.github/workflows/self-scheduled.yml (vendored)
@@ -9,7 +9,6 @@ on:
 push:
 branches:
 - ci_*
-- framework-agnostic-tokenizers
 repository_dispatch:
 schedule:
 - cron: "0 0 * * *"
@@ -197,6 +197,8 @@ ultilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/
 1. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
 1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.

+To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable)
+
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations. You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).

@@ -2,6 +2,15 @@

 /* Colab dropdown */

+table.center-aligned-table td {
+    text-align: center;
+}
+
+table.center-aligned-table th {
+    text-align: center;
+    vertical-align: middle;
+}
+
 .colab-dropdown {
 position: relative;
 display: inline-block;
@@ -26,7 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'3.5.0'
+release = u'4.0.0'


 # -- General configuration ---------------------------------------------------
@@ -35,6 +35,8 @@ Choose the right framework for every part of a model's lifetime:
 - Move a single model between TF2.0/PyTorch frameworks at will
 - Seamlessly pick the right framework for training, evaluation, production

+Experimental support for Flax with a few models right now, expected to grow in the coming months.
+
 Contents
 -----------------------------------------------------------------------------------------------------------------------

@@ -44,7 +46,7 @@ The documentation is organized in five parts:
 and a glossary.
 - **USING 🤗 TRANSFORMERS** contains general tutorials on how to use the library.
 - **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library.
-- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general resarch in
+- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in
 transformers model
 - The three last section contain the documentation of each public class and function, grouped in:

@@ -52,8 +54,8 @@ The documentation is organized in five parts:
 - **MODELS** for the classes and functions related to each model implemented in the library.
 - **INTERNAL HELPERS** for the classes and functions we use internally.

-The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and
-conversion utilities for the following models:
+The library currently contains PyTorch, Tensorflow and Flax implementations, pretrained model weights, usage scripts
+and conversion utilities for the following models:

 ..
 This list is updated automatically from the README with `make fix-copies`. Do not update manually!
@@ -166,6 +168,97 @@ conversion utilities for the following models:
 34. `Other community models <https://huggingface.co/models>`__, contributed by the `community
 <https://huggingface.co/users>`__.

+
+.. _bigtable:
+
+The table below represents the current support in the library for each of those models: whether they have a Python
+tokenizer (called "slow"), a "fast" tokenizer backed by the 🤗 Tokenizers library, and whether they have support in
+PyTorch, TensorFlow and/or Flax.
+
+..
+This table is updated automatically from the auto modules with `make fix-copies`. Do not update manually!
+
+.. rst-class:: center-aligned-table
+
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
++=============================+================+================+=================+====================+==============+
+| ALBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| BART | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| BERT | ✅ | ✅ | ✅ | ✅ | ✅ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| DPR | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| DeBERTa | ✅ | ❌ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| DistilBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| ELECTRA | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| LayoutLM | ✅ | ✅ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| Marian | ✅ | ❌ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| Pegasus | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| RAG | ✅ | ❌ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| T5 | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| mBART | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| mT5 | ✅ | ✅ | ✅ | ✅ | ❌ |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+

 .. toctree::
 :maxdepth: 2
 :caption: Get started
@@ -70,15 +70,15 @@ to check 🤗 Transformers is properly installed.

 This library provides pretrained models that will be downloaded and cached locally. Unless you specify a location with
 `cache_dir=...` when you use methods like `from_pretrained`, these models will automatically be downloaded in the
-folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the PyTorch
-cache home followed by ``/transformers/`` (even if you don't have PyTorch installed). This is (by order of priority):
+folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the Hugging
+Face cache home followed by ``/transformers/``. This is (by order of priority):

-* shell environment variable ``TORCH_HOME``
-* shell environment variable ``XDG_CACHE_HOME`` + ``/torch/``
-* default: ``~/.cache/torch/``
+* shell environment variable ``HF_HOME``
+* shell environment variable ``XDG_CACHE_HOME`` + ``/huggingface/``
+* default: ``~/.cache/huggingface/``

 So if you don't have any specific environment variable set, the cache directory will be at
-``~/.cache/torch/transformers/``.
+``~/.cache/huggingface/transformers/``.

 **Note:** If you have set a shell environment variable for one of the predecessors of this library
 (``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
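To make the cache behaviour described in this hunk concrete, here is a minimal sketch (an editor's illustration, not part of the diff) of the two usual ways to point the cache at an explicit directory; the `~/my_hf_cache` path is only an example.

```py
import os

# Setting HF_HOME before importing transformers moves the whole Hugging Face
# cache home (the new default is ~/.cache/huggingface/), so downloaded models
# land under $HF_HOME/transformers/.
os.environ["HF_HOME"] = os.path.expanduser("~/my_hf_cache")  # example path

from transformers import AutoModel

# Alternatively, override the location for a single download with `cache_dir`.
model = AutoModel.from_pretrained(
    "bert-base-cased",
    cache_dir=os.path.expanduser("~/my_hf_cache/transformers"),  # example path
)
```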
@@ -97,6 +97,6 @@ You should check out our [swift-coreml-transformers](https://github.com/huggingf
 It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`,
 `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.

-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch or
+At some point in the future, you'll be able to seamlessly move from pretraining or fine-tuning models in PyTorch or
 TensorFlow 2.0 to productizing them in CoreML, or prototype a model or an app in CoreML then research its
 hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!
@@ -1,5 +1,170 @@
 # Migrating from previous packages

+## Migrating from transformers `v3.x` to `v4.x`
+
+A couple of changes were introduced when the switch from version 3 to version 4 was done. Below is a summary of the
+expected changes:
+
+#### 1. AutoTokenizers and pipelines now use fast (rust) tokenizers by default.
+
+The python and rust tokenizers have roughly the same API, but the rust tokenizers have a more complete feature set.
+
+This introduces two breaking changes:
+- The handling of overflowing tokens between the python and rust tokenizers is different.
+- The rust tokenizers do not accept integers in the encoding methods.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+- The pipelines now contain additional features out of the box. See the [token-classification pipeline with the `grouped_entities` flag](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=textclassification#tokenclassificationpipeline).
+- The auto-tokenizers now return rust tokenizers. In order to obtain the python tokenizers instead, the user may use the `use_fast` flag by setting it to `False`:
+
+In version `v3.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+```
+to obtain the same in version `v4.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+```
+
+#### 2. SentencePiece is removed from the required dependencies
+
+The requirement on the SentencePiece dependency has been lifted from the `setup.py`. This is done so that we may have a channel on anaconda cloud without relying on `conda-forge`. This means that the tokenizers that depend on the SentencePiece library will not be available with a standard `transformers` installation.
+
+This includes the **slow** versions of:
+- `XLNetTokenizer`
+- `AlbertTokenizer`
+- `CamembertTokenizer`
+- `MBartTokenizer`
+- `PegasusTokenizer`
+- `T5Tokenizer`
+- `ReformerTokenizer`
+- `XLMRobertaTokenizer`
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should install `sentencepiece` additionally:
+
+In version `v3.x`:
+```bash
+pip install transformers
+```
+to obtain the same in version `v4.x`:
+```bash
+pip install transformers[sentencepiece]
+```
+or
+```bash
+pip install transformers sentencepiece
+```
+#### 3. The architecture of the repo has been updated so that each model resides in its folder
+
+The past and foreseeable addition of new models means that the number of files in the directory `src/transformers` keeps growing and becomes harder to navigate and understand. We made the choice to put each model and the files accompanying it in their own sub-directories.
+
+This is a breaking change as importing intermediary layers using a model's module directly needs to be done via a different path.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should update the path used to access the layers.
+
+In version `v3.x`:
+```py
+from transformers.modeling_bert import BertLayer
+```
+to obtain the same in version `v4.x`:
+```py
+from transformers.models.bert.modeling_bert import BertLayer
+```
+
+#### 4. Switching the `return_dict` argument to `True` by default
+
+The [`return_dict` argument](https://huggingface.co/transformers/main_classes/output.html) enables the return of dict-like python objects containing the model outputs, instead of the standard tuples. This object is self-documented as keys can be used to retrieve values, while also behaving as a tuple as users may retrieve objects by index or by slice.
+
+This is a breaking change because the returned object cannot be unpacked like a plain tuple: `value0, value1 = outputs` will not work.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should specify the `return_dict` argument to `False`, either in the model configuration or during the forward pass.
+
+In version `v3.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs)
+```
+to obtain the same in version `v4.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs, return_dict=False)
+```
+or
+```py
+model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
+outputs = model(**inputs)
+```
+
+#### 5. Removed some deprecated attributes
+
+Attributes that were deprecated have been removed if they had been deprecated for at least a month. The full list of deprecated attributes can be found in [#8604](https://github.com/huggingface/transformers/pull/8604).
+
+Here is a list of these attributes/methods/arguments and what their replacements should be:
+
+In several models, the labels become consistent with the other models:
+- `masked_lm_labels` becomes `labels` in `AlbertForMaskedLM` and `AlbertForPreTraining`.
+- `masked_lm_labels` becomes `labels` in `BertForMaskedLM` and `BertForPreTraining`.
+- `masked_lm_labels` becomes `labels` in `DistilBertForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `ElectraForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `LongformerForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `MobileBertForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `RobertaForMaskedLM`.
+- `lm_labels` becomes `labels` in `BartForConditionalGeneration`.
+- `lm_labels` becomes `labels` in `GPT2DoubleHeadsModel`.
+- `lm_labels` becomes `labels` in `OpenAIGPTDoubleHeadsModel`.
+- `lm_labels` becomes `labels` in `T5ForConditionalGeneration`.
+
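As a concrete illustration of the `labels` rename above (an editor's sketch, not part of the diff), using `BertForMaskedLM`; the other models listed behave the same way:

```py
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertForMaskedLM.from_pretrained("bert-base-cased")

inputs = tokenizer("Paris is the capital of France.", return_tensors="pt")

# v3.x (argument removed in v4.x):
# outputs = model(**inputs, masked_lm_labels=inputs["input_ids"])

# v4.x:
outputs = model(**inputs, labels=inputs["input_ids"])
print(outputs.loss)
```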
+In several models, the caching mechanism becomes consistent with the other models:
+- `decoder_cached_states` becomes `past_key_values` in all BART-like, FSMT and T5 models.
+- `decoder_past_key_values` becomes `past_key_values` in all BART-like, FSMT and T5 models.
+- `past` becomes `past_key_values` in all CTRL models.
+- `past` becomes `past_key_values` in all GPT-2 models.
+
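A short sketch of the cache rename (again an editor's illustration, not part of the diff), using GPT-2 as an example:

```py
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model(**inputs, use_cache=True)
cache = outputs.past_key_values  # cache returned by the forward pass

next_token = torch.tensor([[tokenizer.eos_token_id]])
# v3.x (argument removed in v4.x): model(next_token, past=cache)
outputs = model(next_token, past_key_values=cache)
```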
+Regarding the tokenizer classes:
+- The tokenizer attribute `max_len` becomes `model_max_length`.
+- The tokenizer attribute `return_lengths` becomes `return_length`.
+- The tokenizer encoding argument `is_pretokenized` becomes `is_split_into_words`.
+
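And a sketch of the tokenizer renames (editor's illustration, not part of the diff):

```py
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# v3.x: tokenizer.max_len
print(tokenizer.model_max_length)

# v3.x: tokenizer(words, is_pretokenized=True, return_lengths=True)
encoded = tokenizer("Paris is nice".split(), is_split_into_words=True, return_length=True)
print(encoded["length"])
```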
+Regarding the `Trainer` class:
+- The `Trainer` argument `tb_writer` is removed in favor of the callback `TensorBoardCallback(tb_writer=...)`.
+- The `Trainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
+- The `Trainer` attribute `data_collator` should be a callable.
+- The `Trainer` method `_log` is deprecated in favor of `log`.
+- The `Trainer` method `_training_step` is deprecated in favor of `training_step`.
+- The `Trainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
+- The `Trainer` method `is_local_master` is deprecated in favor of `is_local_process_zero`.
+- The `Trainer` method `is_world_master` is deprecated in favor of `is_world_process_zero`.
+
+Regarding the `TFTrainer` class:
+- The `TFTrainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
+- The `TFTrainer` method `_log` is deprecated in favor of `log`.
+- The `TFTrainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
+- The `TFTrainer` method `_setup_wandb` is deprecated in favor of `setup_wandb`.
+- The `TFTrainer` method `_run_model` is deprecated in favor of `run_model`.
+
+Regarding the `TrainingArguments` class:
+- The `TrainingArguments` argument `evaluate_during_training` is deprecated in favor of `evaluation_strategy`.
+
+Regarding the Transfo-XL model:
+- The Transfo-XL configuration attribute `tie_weight` becomes `tie_words_embeddings`.
+- The Transfo-XL modeling method `reset_length` becomes `reset_memory_length`.
+
+Regarding pipelines:
+- The `FillMaskPipeline` argument `topk` becomes `top_k`.
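For the pipeline rename, a minimal sketch (editor's illustration, not part of the diff); the `distilroberta-base` checkpoint and the call-time `top_k` argument are assumptions for the example:

```py
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="distilroberta-base")

# v3.x: fill_mask("Paris is the <mask> of France.", topk=3)
predictions = fill_mask("Paris is the <mask> of France.", top_k=3)
print([p["token_str"] for p in predictions])
```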

 ## Migrating from pytorch-transformers to 🤗 Transformers

 Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to 🤗 Transformers.
@@ -10,7 +10,7 @@ Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi Narayan, Ali

 The abstract from the paper is the following:

-*Unsupervised pre-training of large neural models has recently revolutionized Natural Language Processing. By
+*Unsupervised pretraining of large neural models has recently revolutionized Natural Language Processing. By
 warm-starting from the publicly released checkpoints, NLP practitioners have pushed the state-of-the-art on multiple
 benchmarks while saving significant amounts of compute time. So far the focus has been mainly on the Natural Language
 Understanding tasks. In this paper, we demonstrate the efficacy of pre-trained checkpoints for Sequence Generation. We
@@ -20,8 +20,8 @@ disentangled attention mechanism, where each word is represented using two vecto
 position, respectively, and the attention weights among words are computed using disentangled matrices on their
 contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
 predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
-of model pre-training and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half
-of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
+of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
+the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
 (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
 pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*

@@ -18,9 +18,9 @@ operating these large models in on-the-edge and/or under constrained computation
 remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
 model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
 counterparts. While most prior work investigated the use of distillation for building task-specific models, we leverage
-knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a BERT model by
+knowledge distillation during the pretraining phase and show that it is possible to reduce the size of a BERT model by
 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage the inductive
-biases learned by larger models during pre-training, we introduce a triple loss combining language modeling,
+biases learned by larger models during pretraining, we introduce a triple loss combining language modeling,
 distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train and we
 demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative on-device
 study.*
@@ -5,7 +5,7 @@ Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain Q&A research. It was
-intorduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__ by
+introduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__ by
 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih.

 The abstract from the paper is the following:
@@ -12,14 +12,14 @@ identify which tokens were replaced by the generator in the sequence.

 The abstract from the paper is the following:

-*Masked language modeling (MLM) pre-training methods such as BERT corrupt the input by replacing some tokens with
-[MASK] and then train a model to reconstruct the original tokens. While they produce good results when transferred to
+*Masked language modeling (MLM) pretraining methods such as BERT corrupt the input by replacing some tokens with [MASK]
+and then train a model to reconstruct the original tokens. While they produce good results when transferred to
 downstream NLP tasks, they generally require large amounts of compute to be effective. As an alternative, we propose a
-more sample-efficient pre-training task called replaced token detection. Instead of masking the input, our approach
+more sample-efficient pretraining task called replaced token detection. Instead of masking the input, our approach
 corrupts it by replacing some tokens with plausible alternatives sampled from a small generator network. Then, instead
 of training a model that predicts the original identities of the corrupted tokens, we train a discriminative model that
 predicts whether each token in the corrupted input was replaced by a generator sample or not. Thorough experiments
-demonstrate this new pre-training task is more efficient than MLM because the task is defined over all input tokens
+demonstrate this new pretraining task is more efficient than MLM because the task is defined over all input tokens
 rather than just the small subset that was masked out. As a result, the contextual representations learned by our
 approach substantially outperform the ones learned by BERT given the same model size, data, and compute. The gains are
 particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained
@@ -19,7 +19,7 @@ representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018;
 heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre for
 Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
 classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most of the
-time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified evaluation
+time they outperform other pretraining approaches. Different versions of FlauBERT as well as a unified evaluation
 protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared to the research
 community for further reproducible experiments in French NLP.*

@@ -14,7 +14,7 @@ The abstract from the paper is the following:
 *Natural language understanding comprises a wide range of diverse tasks such as textual entailment, question answering,
 semantic similarity assessment, and document classification. Although large unlabeled text corpora are abundant,
 labeled data for learning these specific tasks is scarce, making it challenging for discriminatively trained models to
-perform adequately. We demonstrate that large gains on these tasks can be realized by generative pre-training of a
+perform adequately. We demonstrate that large gains on these tasks can be realized by generative pretraining of a
 language model on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each specific task. In
 contrast to previous approaches, we make use of task-aware input transformations during fine-tuning to achieve
 effective transfer while requiring minimal changes to the model architecture. We demonstrate the effectiveness of our
@@ -6,19 +6,19 @@ Overview

 The LayoutLM model was proposed in the paper `LayoutLM: Pre-training of Text and Layout for Document Image
 Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and
-Ming Zhou. It's a simple but effective pre-training method of text and layout for document image understanding and
+Ming Zhou. It's a simple but effective pretraining method of text and layout for document image understanding and
 information extraction tasks, such as form understanding and receipt understanding.

 The abstract from the paper is the following:

 *Pre-training techniques have been verified successfully in a variety of NLP tasks in recent years. Despite the
-widespread use of pre-training models for NLP applications, they almost exclusively focus on text-level manipulation,
+widespread use of pretraining models for NLP applications, they almost exclusively focus on text-level manipulation,
 while neglecting layout and style information that is vital for document image understanding. In this paper, we propose
 the \textbf{LayoutLM} to jointly model interactions between text and layout information across scanned document images,
 which is beneficial for a great number of real-world document image understanding tasks such as information extraction
 from scanned documents. Furthermore, we also leverage image features to incorporate words' visual information into
 LayoutLM. To the best of our knowledge, this is the first time that text and layout are jointly learned in a single
-framework for document-level pre-training. It achieves new state-of-the-art results in several downstream tasks,
+framework for document-level pretraining. It achieves new state-of-the-art results in several downstream tasks,
 including form understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image
 classification (from 93.07 to 94.42).*

@@ -19,7 +19,7 @@ Encoder Representations from Transformers) framework to learn these vision-and-l
 build a large-scale Transformer model that consists of three encoders: an object relationship encoder, a language
 encoder, and a cross-modality encoder. Next, to endow our model with the capability of connecting vision and language
 semantics, we pre-train the model with large amounts of image-and-sentence pairs, via five diverse representative
-pre-training tasks: masked language modeling, masked object prediction (feature regression and label classification),
+pretraining tasks: masked language modeling, masked object prediction (feature regression and label classification),
 cross-modality matching, and image question answering. These tasks help in learning both intra-modality and
 cross-modality relationships. After fine-tuning from our pretrained parameters, our model achieves the state-of-the-art
 results on two visual question answering datasets (i.e., VQA and GQA). We also show the generalizability of our
@@ -13,7 +13,7 @@ The MBart model was presented in `Multilingual Denoising Pre-training for Neural
 Ghazvininejad, Mike Lewis, Luke Zettlemoyer.

 According to the abstract, MBART is a sequence-to-sequence denoising auto-encoder pretrained on large-scale monolingual
-corpora in many languages using the BART objective. mBART is one of the first methods for pre-training a complete
+corpora in many languages using the BART objective. mBART is one of the first methods for pretraining a complete
 sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only
 on the encoder, decoder, or reconstructing parts of the text.

@@ -17,7 +17,7 @@ the next token.

 The abstract from the paper is the following:

-*In this paper, we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
 self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
 the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
 n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
|
|||||||
overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
|
overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
|
||||||
dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
|
dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
|
||||||
abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
|
abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
|
||||||
state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus.*
|
state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
|
||||||
|
|
||||||
The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
|
The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ The abstract from the paper is the following:
|
|||||||
task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning
|
task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning
|
||||||
has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of
|
has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of
|
||||||
transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a
|
transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a
|
||||||
text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer
|
text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer
|
||||||
approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration
|
approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration
|
||||||
with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering
|
with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering
|
||||||
summarization, question answering, text classification, and more. To facilitate future work on transfer learning for
|
summarization, question answering, text classification, and more. To facilitate future work on transfer learning for
|
||||||
|
@ -19,7 +19,7 @@ just the next token. Its architecture is identical to ProhpetNet, but the model
|
|||||||
|
|
||||||
The abstract from the paper is the following:
|
The abstract from the paper is the following:
|
||||||
|
|
||||||
*In this paper, we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel
|
*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
|
||||||
self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
|
self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
|
||||||
the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
|
the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
|
||||||
n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
|
n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
|
||||||
@ -27,7 +27,7 @@ step. The future n-gram prediction explicitly encourages the model to plan for t
|
|||||||
overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
|
overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
|
||||||
dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
|
dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
|
||||||
abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
|
abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
|
||||||
state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus.*
|
state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
|
||||||
|
|
||||||
The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
|
The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
|
||||||
|
|
||||||
|
@@ -527,10 +527,10 @@ Pegasus
 <https://arxiv.org/pdf/1912.08777.pdf>`_, Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.

 Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on
-two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pre-training
+two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining
 objective, called Gap Sentence Generation (GSG).

-* MLM: encoder input tokens are randomely replaced by a mask tokens and have to be predicted by the encoder (like in
+* MLM: encoder input tokens are randomly replaced by a mask tokens and have to be predicted by the encoder (like in
   BERT)
 * GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a
   causal mask to hide the future words like a regular auto-regressive transformer decoder.
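A quick way to see what the GSG objective buys downstream is to run a released checkpoint for summarization. A minimal sketch, where the `google/pegasus-xsum` checkpoint name and the input sentence are illustrative assumptions rather than part of the diff above:

```python
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Assumed checkpoint name; any Pegasus checkpoint from the model hub is loaded the same way.
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

text = "PG&E stated it scheduled the blackouts in response to forecasts for high winds."
batch = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
summary_ids = model.generate(**batch)  # the decoder autoregressively produces the summary / "gap sentence"
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))
```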
@@ -609,7 +609,7 @@ MT5
 `mT5: A massively multilingual pre-trained text-to-text transformer <https://arxiv.org/abs/2010.11934>`_, Linting Xue
 et al.

-The model architecture is same as T5. mT5's pre-training objective includes T5's self-supervised training, but not T5's
+The model architecture is same as T5. mT5's pretraining objective includes T5's self-supervised training, but not T5's
 supervised training. mT5 is trained on 101 languages.

 The library provides a version of this model for conditional generation.

@@ -630,8 +630,8 @@ MBart
 `Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu,
 Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.

-The model architecture and pre-training objective is same as BART, but MBart is trained on 25 languages and is intended
-for supervised and unsupervised machine translation. MBart is one of the first methods for pre-training a complete
+The model architecture and pretraining objective is same as BART, but MBart is trained on 25 languages and is intended
+for supervised and unsupervised machine translation. MBart is one of the first methods for pretraining a complete
 sequence-to-sequence model by denoising full texts in multiple languages,

 The library provides a version of this model for conditional generation.

@@ -658,7 +658,7 @@ ProphetNet
 `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.

-ProphetNet introduces a novel *sequence-to-sequence* pre-training objective, called *future n-gram prediction*. In
+ProphetNet introduces a novel *sequence-to-sequence* pretraining objective, called *future n-gram prediction*. In
 future n-gram prediction, the model predicts the next n tokens simultaneously based on previous context tokens at each
 time step instead instead of just the single next token. The future n-gram prediction explicitly encourages the model
 to plan for the future tokens and prevent overfitting on strong local correlations. The model architecture is based on

@@ -683,8 +683,8 @@ XLM-ProphetNet
 `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.

-XLM-ProphetNet's model architecture and pre-training objective is same as ProphetNet, but XLM-ProphetNet was
-pre-trained on the cross-lingual dataset `XGLUE <https://arxiv.org/abs/2004.01401>`__.
+XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained
+on the cross-lingual dataset `XGLUE <https://arxiv.org/abs/2004.01401>`__.

 The library provides a pre-trained version of this model for multi-lingual conditional generation and fine-tuned
 versions for headline generation and question generation, respectively.

@@ -109,7 +109,7 @@ XLM-RoBERTa
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 XLM-RoBERTa was trained on 2.5TB of newly created clean CommonCrawl data in 100 languages. It provides strong gains
-over previously released multi-lingual models like mBERT or XLM on downstream taks like classification, sequence
+over previously released multi-lingual models like mBERT or XLM on downstream tasks like classification, sequence
 labeling and question answering.

 Two XLM-RoBERTa checkpoints can be used for multi-lingual tasks:
@@ -62,7 +62,7 @@ sliding the context window so that the model has more context when making each p
 This is a closer approximation to the true decomposition of the sequence probability and will typically yield a more
 favorable score. The downside is that it requires a separate forward pass for each token in the corpus. A good
 practical compromise is to employ a strided sliding window, moving the context by larger strides rather than sliding by
-1 token a time. This allows computation to procede much faster while still giving the model a large context to make
+1 token a time. This allows computation to proceed much faster while still giving the model a large context to make
 predictions at each step.

 Example: Calculating perplexity with GPT-2 in 🤗 Transformers
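A minimal sketch of the strided evaluation described above, using GPT-2; the stride of 512 and the placeholder text are illustrative assumptions, not taken from the documentation's full example:

```python
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda" if torch.cuda.is_available() else "cpu"
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Placeholder corpus; in practice this would be a real evaluation set.
text = "A long evaluation text goes here ..."
encodings = tokenizer(text, return_tensors="pt")

max_length = model.config.n_positions  # 1024 for GPT-2
stride = 512                           # how far the window advances each step (assumed value)

nlls = []
for i in range(0, encodings.input_ids.size(1), stride):
    begin_loc = max(i + stride - max_length, 0)
    end_loc = min(i + stride, encodings.input_ids.size(1))
    trg_len = end_loc - i                     # only score the tokens the window moved over
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100           # overlapping context tokens are not scored

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)
        nlls.append(outputs.loss * trg_len)   # loss is averaged over the scored tokens

ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
print(ppl)
```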
@@ -305,7 +305,7 @@ Language modeling is the task of fitting a model to a corpus, which can be domai
 transformer-based models are trained using a variant of language modeling, e.g. BERT with masked language modeling,
 GPT-2 with causal language modeling.

-Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
+Language modeling can be useful outside of pretraining as well, for example to shift the model distribution to be
 domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset or
 on scientific papers e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.

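A small sketch of using such a domain-adapted checkpoint; the model identifier comes from the link in the passage above, and treating it as a GPT-2-style causal LM in a text-generation pipeline is an assumption of this illustration:

```python
from transformers import pipeline

# "lysandre/arxiv-nlp" is the checkpoint linked above; assumed here to be a causal LM.
generator = pipeline("text-generation", model="lysandre/arxiv-nlp")
print(generator("We propose a novel attention mechanism that", max_length=40))
```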
@@ -25,7 +25,7 @@ class PlotArguments:
     )
     plot_along_batch: bool = field(
         default=False,
-        metadata={"help": "Whether to plot along batch size or sequence lengh. Defaults to sequence length."},
+        metadata={"help": "Whether to plot along batch size or sequence length. Defaults to sequence length."},
     )
     is_time: bool = field(
         default=False,
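Fields like these are consumed by `HfArgumentParser`, which turns the dataclass into a command-line interface. A minimal sketch with a cut-down, hypothetical version of the dataclass (the `csv_file` field is invented for the example):

```python
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class PlotArguments:
    # Hypothetical, trimmed-down version of the dataclass shown in the diff above.
    csv_file: str = field(metadata={"help": "Path to the benchmark csv file."})
    plot_along_batch: bool = field(
        default=False,
        metadata={"help": "Whether to plot along batch size or sequence length. Defaults to sequence length."},
    )


parser = HfArgumentParser(PlotArguments)
# e.g. python plot.py --csv_file results.csv --plot_along_batch
(plot_args,) = parser.parse_args_into_dataclasses()
print(plot_args)
```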
@@ -17,7 +17,7 @@ This folder contains the original code used to train Distil* as well as examples

 ## What is Distil*

-Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
+Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distilled-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.

 We have applied the same method to other Transformer architectures and released the weights:
 - GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 16.3 compared to 21.1 for **DistilGPT2** (after fine-tuning on the train set).
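At its core, the teacher/student setup described above penalizes the student for diverging from the teacher's softened output distribution. A minimal sketch of that term only; the temperature value and the omission of the hard-label and cosine losses used by the actual training scripts are simplifications for illustration:

```python
import torch.nn.functional as F


def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    # Soften both distributions with the temperature and compare them with KL divergence.
    # Multiplying by T**2 keeps the gradient magnitude comparable across temperatures.
    t = temperature
    return F.kl_div(
        F.log_softmax(student_logits / t, dim=-1),
        F.softmax(teacher_logits / t, dim=-1),
        reduction="batchmean",
    ) * (t ** 2)
```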
@@ -57,7 +57,7 @@ Here are the results on the *test* sets for 6 of the languages available in XNLI

 This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`.

-**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0).
+**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breaking changes compared to v1.1.0).


 ## How to use DistilBERT

@@ -111,7 +111,7 @@ python scripts/binarized_data.py \
     --dump_file data/binarized_text
 ```

-Our implementation of masked language modeling loss follows [XLM](https://github.com/facebookresearch/XLM)'s one and smoothes the probability of masking with a factor that put more emphasis on rare words. Thus we count the occurrences of each tokens in the data:
+Our implementation of masked language modeling loss follows [XLM](https://github.com/facebookresearch/XLM)'s one and smooths the probability of masking with a factor that put more emphasis on rare words. Thus we count the occurrences of each tokens in the data:

 ```bash
 python scripts/token_counts.py \
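As a rough illustration of that smoothing, the token counts can be turned into masking probabilities with a negative exponent so that rare tokens are masked more often. The 0.7 smoothing factor and the toy data below are assumptions, not values taken from the scripts:

```python
from collections import Counter

import numpy as np

# Count occurrences of every token id in the binarized data (what scripts/token_counts.py produces).
token_ids_per_seq = [[5, 7, 7, 9], [5, 5, 12]]   # toy stand-in for data/binarized_text
counts = Counter(tid for seq in token_ids_per_seq for tid in seq)

vocab_size = 30522                                # bert-base-uncased vocabulary size
freqs = np.ones(vocab_size)                       # start at 1 so unseen tokens don't divide by zero
for tid, c in counts.items():
    freqs[tid] += c

smoothing = 0.7                                   # assumed value, for illustration only
token_probs = freqs ** -smoothing                 # rarer tokens -> larger masking weight
token_probs /= token_probs.sum()                  # normalized masking distribution
```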
@@ -173,7 +173,7 @@ python -m torch.distributed.launch \
     --token_counts data/token_counts.bert-base-uncased.pickle
 ```

-**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint and use `--student_pretrained_weights` argument to use this initialization for the distilled training!
+**Tips:** Starting distilled training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint and use `--student_pretrained_weights` argument to use this initialization for the distilled training!

 Happy distillation!


@@ -188,7 +188,7 @@ class Distiller:

     def prepare_batch_mlm(self, batch):
         """
-        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.
+        Prepare the batch: from the token_ids and the lengths, compute the attention mask and the masked label for MLM.

         Input:
         ------

@@ -200,7 +200,7 @@ class Distiller:
         -------
         token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
         attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-        mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict.
+        mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels. There is a -100 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
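Computing the attention mask from the sequence lengths, as the docstring describes, boils down to a broadcast comparison. A minimal standalone sketch (not the Distiller code itself):

```python
import torch


def attention_mask_from_lengths(lengths, max_len=None):
    # lengths: LongTensor of shape (bs,) holding the true length of each sequence.
    # Returns a (bs, max_len) mask with 1 for real tokens and 0 for padding.
    max_len = max_len or int(lengths.max())
    positions = torch.arange(max_len, device=lengths.device)   # (max_len,)
    return (positions[None, :] < lengths[:, None]).long()


lengths = torch.tensor([3, 5])
print(attention_mask_from_lengths(lengths))
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1]])
```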
@@ -253,7 +253,7 @@ class Distiller:

     def prepare_batch_clm(self, batch):
         """
-        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM.
+        Prepare the batch: from the token_ids and the lengths, compute the attention mask and the labels for CLM.

         Input:
         ------

@@ -86,7 +86,7 @@ if __name__ == "__main__":
         compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"]

     print(f"N layers selected for distillation: {std_idx}")
-    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
+    print(f"Number of params transferred for distillation: {len(compressed_sd.keys())}")

-    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
+    print(f"Save transferred checkpoint to {args.dump_checkpoint}.")
     torch.save(compressed_sd, args.dump_checkpoint)

@@ -21,7 +21,7 @@ You can also have a look at this fun *Explain Like I'm Five* introductory [slide

 One promise of extreme pruning is to obtain extremely small models that can be easily sent (and stored) on edge devices. By setting weights to 0., we reduce the amount of information we need to store, and thus decreasing the memory size. We are able to obtain extremely sparse fine-pruned models with movement pruning: ~95% of the dense performance with ~5% of total remaining weights in the BERT encoder.

-In [this notebook](https://github.com/huggingface/transformers/blob/master/examples/movement-pruning/Saving_PruneBERT.ipynb), we showcase how we can leverage standard tools that exist out-of-the-box to efficiently store an extremely sparse question answering model (only 6% of total remaining weights in the encoder). We are able to reduce the memory size of the encoder **from the 340MB (the orignal dense BERT) to 11MB**, without any additional training of the model (every operation is performed *post fine-pruning*). It is sufficiently small to store it on a [91' floppy disk](https://en.wikipedia.org/wiki/Floptical) 📎!
+In [this notebook](https://github.com/huggingface/transformers/blob/master/examples/movement-pruning/Saving_PruneBERT.ipynb), we showcase how we can leverage standard tools that exist out-of-the-box to efficiently store an extremely sparse question answering model (only 6% of total remaining weights in the encoder). We are able to reduce the memory size of the encoder **from the 340MB (the original dense BERT) to 11MB**, without any additional training of the model (every operation is performed *post fine-pruning*). It is sufficiently small to store it on a [91' floppy disk](https://en.wikipedia.org/wiki/Floptical) 📎!

 While movement pruning does not directly optimize for memory footprint (but rather the number of non-null weights), we hypothetize that further memory compression ratios can be achieved with specific quantization aware trainings (see for instance [Q8BERT](https://arxiv.org/abs/1910.06188), [And the Bit Goes Down](https://arxiv.org/abs/1907.05686) or [Quant-Noise](https://arxiv.org/abs/2004.07320)).

@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Binarizers take a (real value) matrice as input and produce a binary (values in {0,1}) mask of the same shape.
+Binarizers take a (real value) matrix as input and produce a binary (values in {0,1}) mask of the same shape.
 """

 import torch
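For intuition, a threshold binarizer of the kind that module describes can be written in a couple of lines. This standalone sketch is not the repository's implementation (which also provides top-k and straight-through variants):

```python
import torch


def threshold_binarizer(scores: torch.Tensor, threshold: float) -> torch.Tensor:
    # Real-valued importance scores in, a {0, 1} mask of the same shape out.
    return (torch.sigmoid(scores) > threshold).to(scores.dtype)


scores = torch.randn(2, 3)
mask = threshold_binarizer(scores, threshold=0.5)
print(mask)  # e.g. tensor([[1., 0., 1.], [0., 1., 0.]])
```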
@@ -3,7 +3,8 @@
 python finetune_trainer.py \
     --learning_rate=3e-5 \
     --fp16 \
-    --do_train --do_eval --do_predict --evaluate_during_training \
+    --do_train --do_eval --do_predict \
+    --evaluation_strategy steps \
     --predict_with_generate \
     --n_val 1000 \
     "$@"

@@ -5,7 +5,8 @@ export TPU_NUM_CORES=8
 python xla_spawn.py --num_cores $TPU_NUM_CORES \
     finetune_trainer.py \
     --learning_rate=3e-5 \
-    --do_train --do_eval --evaluate_during_training \
+    --do_train --do_eval \
+    --evaluation_strategy steps \
     --prediction_loss_only \
     --n_val 1000 \
     "$@"

@@ -16,7 +16,8 @@ python finetune_trainer.py \
     --num_train_epochs=6 \
     --save_steps 3000 --eval_steps 3000 \
     --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
-    --do_train --do_eval --do_predict --evaluate_during_training\
+    --do_train --do_eval --do_predict \
+    --evaluation_strategy steps \
     --predict_with_generate --logging_first_step \
     --task translation --label_smoothing 0.1 \
     "$@"

@@ -17,7 +17,8 @@ python xla_spawn.py --num_cores $TPU_NUM_CORES \
     --save_steps 500 --eval_steps 500 \
     --logging_first_step --logging_steps 200 \
     --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
-    --do_train --do_eval --evaluate_during_training \
+    --do_train --do_eval \
+    --evaluation_strategy steps \
     --prediction_loss_only \
     --task translation --label_smoothing 0.1 \
     "$@"

@@ -19,6 +19,7 @@ python finetune_trainer.py \
     --save_steps 3000 --eval_steps 3000 \
     --logging_first_step \
     --max_target_length 56 --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \
-    --do_train --do_eval --do_predict --evaluate_during_training \
+    --do_train --do_eval --do_predict \
+    --evaluation_strategy steps \
     --predict_with_generate --sortish_sampler \
     "$@"

@@ -15,7 +15,8 @@ python finetune_trainer.py \
     --sortish_sampler \
     --num_train_epochs 6 \
     --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \
-    --do_train --do_eval --do_predict --evaluate_during_training \
-    --predict_with_generate --logging_first_step
+    --do_train --do_eval --do_predict \
+    --evaluation_strategy steps \
+    --predict_with_generate --logging_first_step \
     --task translation \
     "$@"

@@ -4,7 +4,7 @@ language: sv

 # Swedish BERT Models

-The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on aproximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, swedish wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.
+The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on approximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, swedish wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.

 The following three models are currently available:


@@ -86,7 +86,7 @@ for token in nlp(text):
     print(l)
 ```

-Which should result in the following (though less cleanly formated):
+Which should result in the following (though less cleanly formatted):

 ```python
 [ { 'word': 'Engelbert', 'score': 0.99..., 'entity': 'PRS'},

@@ -104,7 +104,7 @@ Which should result in the following (though less cleanly formated):

 ### ALBERT base

-The easisest way to do this is, again, using Huggingface Transformers:
+The easiest way to do this is, again, using Huggingface Transformers:

 ```python
 from transformers import AutoModel,AutoTokenizer
@@ -4,7 +4,7 @@ language: sv

 # Swedish BERT Models

-The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on aproximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, swedish wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.
+The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on approximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, swedish wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.

 The following three models are currently available:


@@ -86,7 +86,7 @@ for token in nlp(text):
     print(l)
 ```

-Which should result in the following (though less cleanly formated):
+Which should result in the following (though less cleanly formatted):

 ```python
 [ { 'word': 'Engelbert', 'score': 0.99..., 'entity': 'PRS'},

@@ -104,7 +104,7 @@ Which should result in the following (though less cleanly formated):

 ### ALBERT base

-The easisest way to do this is, again, using Huggingface Transformers:
+The easiest way to do this is, again, using Huggingface Transformers:

 ```python
 from transformers import AutoModel,AutoTokenizer
@@ -4,7 +4,7 @@ tags:
 ---

 ## CS224n SQuAD2.0 Project Dataset
-The goal of this model is to save CS224n students GPU time when establising
+The goal of this model is to save CS224n students GPU time when establishing
 baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
 The training set used to fine-tune this model is the same as
 the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,

@@ -4,7 +4,7 @@ tags:
 ---

 ## CS224n SQuAD2.0 Project Dataset
-The goal of this model is to save CS224n students GPU time when establising
+The goal of this model is to save CS224n students GPU time when establishing
 baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
 The training set used to fine-tune this model is the same as
 the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,

@@ -1,5 +1,5 @@
 ## CS224n SQuAD2.0 Project Dataset
-The goal of this model is to save CS224n students GPU time when establising
+The goal of this model is to save CS224n students GPU time when establishing
 baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
 The training set used to fine-tune this model is the same as
 the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,

@@ -1,5 +1,5 @@
 ## CS224n SQuAD2.0 Project Dataset
-The goal of this model is to save CS224n students GPU time when establising
+The goal of this model is to save CS224n students GPU time when establishing
 baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
 The training set used to fine-tune this model is the same as
 the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,

@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before

 ## Details of the downstream task (Intent Prediction) - Dataset 📚

-Dataset ID: ```event2Mind``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
+Dataset ID: ```event2Mind``` from [Huggingface/NLP](https://github.com/huggingface/nlp)

 | Dataset | Split | # samples |
 | -------- | ----- | --------- |

@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before

 ## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓

-Dataset ID: ```squad``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
+Dataset ID: ```squad``` from [Huggingface/NLP](https://github.com/huggingface/nlp)

 | Dataset | Split | # samples |
 | -------- | ----- | --------- |

@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before

 ## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓

-Dataset ID: ```squad_v2``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
+Dataset ID: ```squad_v2``` from [Huggingface/NLP](https://github.com/huggingface/nlp)

 | Dataset | Split | # samples |
 | -------- | ----- | --------- |

@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before

 ## Details of the Dataset 📚

-Dataset ID: ```wikisql``` from [HugginFace/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
+Dataset ID: ```wikisql``` from [Huggingface/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)

 | Dataset | Split | # samples |
 | -------- | ----- | --------- |

@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before

 ## Details of the Dataset 📚

-Dataset ID: ```wikisql``` from [HugginFace/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
+Dataset ID: ```wikisql``` from [Huggingface/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)

 | Dataset | Split | # samples |
 | -------- | ----- | --------- |

@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before

 ## Details of the downstream task (Question Paraphrasing) - Dataset 📚❓↔️❓

-Dataset ID: ```quora``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
+Dataset ID: ```quora``` from [Huggingface/NLP](https://github.com/huggingface/nlp)

 | Dataset | Split | # samples |
 | -------- | ----- | --------- |

@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before

 ## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓

-Dataset ID: ```squad``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
+Dataset ID: ```squad``` from [Huggingface/NLP](https://github.com/huggingface/nlp)

 | Dataset | Split | # samples |
 | -------- | ----- | --------- |

@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before

 ## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓

-Dataset ID: ```squad_v2``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
+Dataset ID: ```squad_v2``` from [Huggingface/NLP](https://github.com/huggingface/nlp)

 | Dataset | Split | # samples |
 | -------- | ----- | --------- |

@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before

 ## Details of the Dataset 📚

-Dataset ID: ```wikisql``` from [HugginFace/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
+Dataset ID: ```wikisql``` from [Huggingface/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)

 | Dataset | Split | # samples |
 | -------- | ----- | --------- |

setup.py
@@ -119,7 +119,7 @@ extras["dev"] = extras["all"] + extras["testing"] + extras["quality"] + extras["

 setup(
     name="transformers",
-    version="4.0.0-rc-1",
+    version="4.0.0",
     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
     author_email="thomas@huggingface.co",
     description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",

@@ -2,7 +2,7 @@
 # There's no way to ignore "F401 '...' imported but unused" warnings in this
 # module, but to preserve other warnings. So, don't check this module at all.

-__version__ = "4.0.0-rc-1"
+__version__ = "4.0.0"

 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
|
|||||||
from .models.auto import (
|
from .models.auto import (
|
||||||
ALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
ALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
CONFIG_MAPPING,
|
CONFIG_MAPPING,
|
||||||
|
MODEL_NAMES_MAPPING,
|
||||||
TOKENIZER_MAPPING,
|
TOKENIZER_MAPPING,
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
@ -876,6 +877,7 @@ else:
|
|||||||
|
|
||||||
|
|
||||||
if is_flax_available():
|
if is_flax_available():
|
||||||
|
from .models.auto import FLAX_MODEL_MAPPING, FlaxAutoModel
|
||||||
from .models.bert import FlaxBertModel
|
from .models.bert import FlaxBertModel
|
||||||
from .models.roberta import FlaxRobertaModel
|
from .models.roberta import FlaxRobertaModel
|
||||||
else:
|
else:
|
||||||
|
@@ -55,8 +55,6 @@ class PretrainedConfig(object):
             Whether or not the model should return all hidden-states.
         output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether or not the model should returns all attentions.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
         return_dict (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a plain
             tuple.

@@ -168,7 +166,6 @@ class PretrainedConfig(object):
         self.return_dict = kwargs.pop("return_dict", True)
         self.output_hidden_states = kwargs.pop("output_hidden_states", False)
         self.output_attentions = kwargs.pop("output_attentions", False)
-        self.use_cache = kwargs.pop("use_cache", True)  # Not used by all models
         self.torchscript = kwargs.pop("torchscript", False)  # Only used by PyTorch models
         self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
         self.pruned_heads = kwargs.pop("pruned_heads", {})
@@ -229,7 +229,7 @@ class LineByLineWithSOPTextDataset(Dataset):
         # to `block_size` anyways, so short sequences are generally wasted
         # computation. However, we *sometimes*
         # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
-        # sequences to minimize the mismatch between pre-training and fine-tuning.
+        # sequences to minimize the mismatch between pretraining and fine-tuning.
         # The `target_seq_length` is just a rough target however, whereas
         # `block_size` is a hard limit.
         target_seq_length = max_num_tokens

@@ -425,7 +425,7 @@ class TextDatasetForNextSentencePrediction(Dataset):
         # to `block_size` anyways, so short sequences are generally wasted
         # computation. However, we *sometimes*
         # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
-        # sequences to minimize the mismatch between pre-training and fine-tuning.
+        # sequences to minimize the mismatch between pretraining and fine-tuning.
         # The `target_seq_length` is just a rough target however, whereas
         # `block_size` is a hard limit.
         target_seq_length = max_num_tokens
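The `short_seq_prob` behavior those comments refer to amounts to occasionally drawing a shorter target length. A simplified sketch of that logic (the function name and the 0.1 default are kept only for illustration):

```python
import random


def pick_target_seq_length(max_num_tokens: int, short_seq_prob: float = 0.1) -> int:
    # Usually fill the whole block, but short_seq_prob of the time (10% here)
    # pick a shorter target to reduce the pretraining / fine-tuning length mismatch.
    target_seq_length = max_num_tokens
    if random.random() < short_seq_prob:
        target_seq_length = random.randint(2, max_num_tokens)
    return target_seq_length


print(pick_target_seq_length(510))
```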
@@ -203,8 +203,28 @@ except ImportError:
     _tokenizers_available = False


-default_cache_path = os.path.join(torch_cache_home, "transformers")
+old_default_cache_path = os.path.join(torch_cache_home, "transformers")
+# New default cache, shared with the Datasets library
+hf_cache_home = os.path.expanduser(
+    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
+)
+default_cache_path = os.path.join(hf_cache_home, "transformers")

+# Onetime move from the old location to the new one if no ENV variable has been set.
+if (
+    os.path.isdir(old_default_cache_path)
+    and "PYTORCH_PRETRAINED_BERT_CACHE" not in os.environ
+    and "PYTORCH_TRANSFORMERS_CACHE" not in os.environ
+    and "TRANSFORMERS_CACHE" not in os.environ
+):
+    logger.warn(
+        "In Transformers v4.0.0, the default path to cache downloaded models changed from "
+        "'~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden "
+        "and '~/.cache/torch/transformers' is a directory that exists, we're moving it to "
+        "'~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should "
+        "only see this message once."
+    )
+    shutil.move(old_default_cache_path, default_cache_path)

 PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
 PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
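In practice this only changes where downloads land by default; the location can still be pinned explicitly, either through the environment variables checked above or per call. A small sketch, with the directory path as an example value:

```python
import os

# Either set one of the environment variables the snippet above checks (before importing transformers)...
os.environ["TRANSFORMERS_CACHE"] = "/data/hf-cache"   # example path

# ...or pass cache_dir explicitly when loading a model or tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", cache_dir="/data/hf-cache")
```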
@@ -38,6 +38,7 @@ class TFGenerationMixin:

     def _use_cache(self, outputs, use_cache):
         """During generation, decide whether to pass the `past` variable to the next forward pass."""
+        use_cache = getattr(self.config, "use_cache", False)
         if len(outputs) <= 1 or use_cache is False:
             return False
         if hasattr(self.config, "mem_len") and self.config.mem_len == 0:

@@ -194,7 +195,6 @@ class TFGenerationMixin:
         min_length = min_length if min_length is not None else self.config.min_length
         do_sample = do_sample if do_sample is not None else self.config.do_sample
         early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
         num_beams = num_beams if num_beams is not None else self.config.num_beams
         temperature = temperature if temperature is not None else self.config.temperature
         top_k = top_k if top_k is not None else self.config.top_k

@@ -224,7 +224,6 @@ class TFGenerationMixin:
         assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
         assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
         assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
-        assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
         assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
         assert temperature > 0, "`temperature` should be strictly positive."
         assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."

@@ -462,7 +462,6 @@ class GenerationMixin:
         pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
         bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
         eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
-        use_cache = use_cache if use_cache is not None else self.config.use_cache

         if input_ids is None:
             # init `input_ids` with bos_token_id
@@ -2,6 +2,7 @@
 import math
 import os

+from .trainer_utils import EvaluationStrategy
 from .utils import logging


@@ -212,13 +213,13 @@ def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestR
         # Check for `do_eval` and `eval_during_training` for schedulers that require intermediate reporting.
         if isinstance(
             kwargs["scheduler"], (ASHAScheduler, MedianStoppingRule, HyperBandForBOHB, PopulationBasedTraining)
-        ) and (not trainer.args.do_eval or not trainer.args.evaluate_during_training):
+        ) and (not trainer.args.do_eval or trainer.args.evaluation_strategy == EvaluationStrategy.NO):
             raise RuntimeError(
                 "You are using {cls} as a scheduler but you haven't enabled evaluation during training. "
                 "This means your trials will not report intermediate results to Ray Tune, and "
                 "can thus not be stopped early or used to exploit other trials parameters. "
                 "If this is what you want, do not use {cls}. If you would like to use {cls}, "
-                "make sure you pass `do_eval=True` and `evaluate_during_training=True` in the "
+                "make sure you pass `do_eval=True` and `evaluation_strategy='steps'` in the "
                 "Trainer `args`.".format(cls=type(kwargs["scheduler"]).__name__)
             )

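The user-facing side of this change is the `TrainingArguments` flag: `evaluate_during_training` is replaced by `evaluation_strategy`. A minimal sketch of arguments that satisfy the check above; the output directory and step counts are placeholder values:

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output",             # placeholder path
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",     # replaces the removed evaluate_during_training=True
    eval_steps=500,
    logging_steps=500,
)
```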
@@ -39,7 +39,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
     return tuple with:

         - pytorch model weight name
-        - transpose: boolean indicating wether TF2.0 and PyTorch weights matrices are transposed with regards to each
+        - transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regards to each
           other
     """
     tf_name = tf_name.replace(":0", "")  # device ids

@@ -164,9 +164,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
             if allow_missing_keys:
                 missing_keys.append(name)
                 continue
-            elif tf_model.authorized_missing_keys is not None:
+            elif tf_model._keys_to_ignore_on_load_missing is not None:
                 # authorized missing keys don't have to be loaded
-                if any(re.search(pat, name) is not None for pat in tf_model.authorized_missing_keys):
+                if any(re.search(pat, name) is not None for pat in tf_model._keys_to_ignore_on_load_missing):
                     continue

             raise AttributeError("{} not found in PyTorch model".format(name))

@@ -209,11 +209,11 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a

     unexpected_keys = list(all_pytorch_weights)

-    if tf_model.authorized_missing_keys is not None:
-        for pat in tf_model.authorized_missing_keys:
+    if tf_model._keys_to_ignore_on_load_missing is not None:
+        for pat in tf_model._keys_to_ignore_on_load_missing:
             missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
-    if tf_model.authorized_unexpected_keys is not None:
-        for pat in tf_model.authorized_unexpected_keys:
+    if tf_model._keys_to_ignore_on_load_unexpected is not None:
+        for pat in tf_model._keys_to_ignore_on_load_unexpected:
             unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]

     if len(unexpected_keys) > 0:
@ -343,15 +343,15 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
|
|||||||
:class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
|
:class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
|
||||||
- **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
|
- **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
|
||||||
derived classes of the same architecture adding modules on top of the base model.
|
derived classes of the same architecture adding modules on top of the base model.
|
||||||
- **authorized_missing_keys** (:obj:`List[str]`, `optional`) -- A list of re pattern of tensor names to ignore
|
|
||||||
from the model when loading the model weights (and avoid unnecessary warnings).
|
|
||||||
- **authorized_unexpected_keys** (:obj:`List[str]`, `optional`) -- A list of re pattern of tensor names to
|
|
||||||
ignore from the weights when loading the model weights (and avoid unnecessary warnings).
|
|
||||||
"""
|
"""
|
||||||
config_class = None
|
config_class = None
|
||||||
base_model_prefix = ""
|
base_model_prefix = ""
|
||||||
authorized_missing_keys = None
|
# a list of re pattern of tensor names to ignore from the model when loading the model weights
|
||||||
authorized_unexpected_keys = None
|
# (and avoid unnecessary warnings).
|
||||||
|
_keys_to_ignore_on_load_missing = None
|
||||||
|
# a list of re pattern of tensor names to ignore from the weights when loading the model weights
|
||||||
|
# (and avoid unnecessary warnings).
|
||||||
|
_keys_to_ignore_on_load_unexpected = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
|
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
|
||||||
@ -742,12 +742,12 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
|
|||||||
|
|
||||||
model(model.dummy_inputs, training=False) # Make sure restore ops are run
|
model(model.dummy_inputs, training=False) # Make sure restore ops are run
|
||||||
|
|
||||||
if cls.authorized_missing_keys is not None:
|
if cls._keys_to_ignore_on_load_missing is not None:
|
||||||
for pat in cls.authorized_missing_keys:
|
for pat in cls._keys_to_ignore_on_load_missing:
|
||||||
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
|
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
|
||||||
|
|
||||||
if cls.authorized_unexpected_keys is not None:
|
if cls._keys_to_ignore_on_load_unexpected is not None:
|
||||||
for pat in cls.authorized_unexpected_keys:
|
for pat in cls._keys_to_ignore_on_load_unexpected:
|
||||||
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
|
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
|
||||||
|
|
||||||
if len(unexpected_keys) > 0:
|
if len(unexpected_keys) > 0:
|
||||||
|
@ -404,17 +404,18 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
|
|||||||
|
|
||||||
- **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
|
- **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
|
||||||
derived classes of the same architecture adding modules on top of the base model.
|
derived classes of the same architecture adding modules on top of the base model.
|
||||||
- **authorized_missing_keys** (:obj:`Optional[List[str]]`) -- A list of re pattern of tensor names to ignore
|
|
||||||
when loading the model (and avoid unnecessary warnings).
|
|
||||||
- **keys_to_never_save** (:obj:`Optional[List[str]]`) -- A list of of tensor names to ignore when saving the
|
|
||||||
model (useful for keys that aren't trained, but which are deterministic)
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
config_class = None
|
config_class = None
|
||||||
base_model_prefix = ""
|
base_model_prefix = ""
|
||||||
authorized_missing_keys = None
|
# a list of re pattern of tensor names to ignore from the model when loading the model weights
|
||||||
authorized_unexpected_keys = None
|
# (and avoid unnecessary warnings).
|
||||||
keys_to_never_save = None
|
_keys_to_ignore_on_load_missing = None
|
||||||
|
# a list of re pattern of tensor names to ignore from the weights when loading the model weights
|
||||||
|
# (and avoid unnecessary warnings).
|
||||||
|
_keys_to_ignore_on_load_unexpected = None
|
||||||
|
# a list of of tensor names to ignore when saving the model (useful for keys that aren't
|
||||||
|
# trained, but which are deterministic)
|
||||||
|
_keys_to_ignore_on_save = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def dummy_inputs(self) -> Dict[str, torch.Tensor]:
|
def dummy_inputs(self) -> Dict[str, torch.Tensor]:
|
||||||
@ -719,8 +720,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
|
|||||||
state_dict = model_to_save.state_dict()
|
state_dict = model_to_save.state_dict()
|
||||||
|
|
||||||
# Handle the case where some state_dict keys shouldn't be saved
|
# Handle the case where some state_dict keys shouldn't be saved
|
||||||
if self.keys_to_never_save is not None:
|
if self._keys_to_ignore_on_save is not None:
|
||||||
state_dict = {k: v for k, v in state_dict.items() if k not in self.keys_to_never_save}
|
state_dict = {k: v for k, v in state_dict.items() if k not in self._keys_to_ignore_on_save}
|
||||||
|
|
||||||
# If we save using the predefined names, we can load using `from_pretrained`
|
# If we save using the predefined names, we can load using `from_pretrained`
|
||||||
output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
|
output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
|
||||||
@ -1034,12 +1035,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
|
|||||||
|
|
||||||
# Some models may have keys that are not in the state by design, removing them before needlessly warning
|
# Some models may have keys that are not in the state by design, removing them before needlessly warning
|
||||||
# the user.
|
# the user.
|
||||||
if cls.authorized_missing_keys is not None:
|
if cls._keys_to_ignore_on_load_missing is not None:
|
||||||
for pat in cls.authorized_missing_keys:
|
for pat in cls._keys_to_ignore_on_load_missing:
|
||||||
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
|
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
|
||||||
|
|
||||||
if cls.authorized_unexpected_keys is not None:
|
if cls._keys_to_ignore_on_load_unexpected is not None:
|
||||||
for pat in cls.authorized_unexpected_keys:
|
for pat in cls._keys_to_ignore_on_load_unexpected:
|
||||||
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
|
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
|
||||||
|
|
||||||
if len(unexpected_keys) > 0:
|
if len(unexpected_keys) > 0:
|
||||||
|
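The two hunks above are where the renamed class-level hooks are consumed at load and save time. A hedged sketch of a derived model opting into the new names; the wrapper class, its patterns, and the trivial `_init_weights` are illustrative and not taken from this diff:

    from transformers import BertConfig, BertModel, PreTrainedModel

    class MyBertWrapper(PreTrainedModel):  # hypothetical example class
        config_class = BertConfig
        base_model_prefix = "bert"
        # re patterns for weights that may legitimately be absent from a checkpoint
        _keys_to_ignore_on_load_missing = [r"position_ids"]
        # re patterns for checkpoint weights this head never uses
        _keys_to_ignore_on_load_unexpected = [r"pooler"]
        # exact state_dict keys that are deterministic and need not be written by save_pretrained
        _keys_to_ignore_on_save = []

        def __init__(self, config):
            super().__init__(config)
            self.bert = BertModel(config)

        def _init_weights(self, module):
            # placeholder; real models initialize Linear/Embedding weights here
            pass

        def forward(self, input_ids=None, attention_mask=None):
            return self.bert(input_ids=input_ids, attention_mask=attention_mask)
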
@@ -214,6 +214,7 @@ class AlbertEmbeddings(nn.Module):

 # position_ids (1, len position emb) is contiguous in memory and exported when serialized
 self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

 # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward
 def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
@@ -265,6 +266,11 @@ class AlbertAttention(nn.Module):
 self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 self.pruned_heads = set()

+self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+self.max_position_embeddings = config.max_position_embeddings
+self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
 # Copied from transformers.models.bert.modeling_bert.BertSelfAttention.transpose_for_scores
 def transpose_for_scores(self, x):
 new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
@@ -459,7 +465,7 @@ class AlbertPreTrainedModel(PreTrainedModel):

 config_class = AlbertConfig
 base_model_prefix = "albert"
-authorized_missing_keys = [r"position_ids"]
+_keys_to_ignore_on_load_missing = [r"position_ids"]

 def _init_weights(self, module):
 """Initialize the weights."""
@@ -705,7 +711,7 @@ class AlbertModel(AlbertPreTrainedModel):

 @add_start_docstrings(
 """
-Albert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a
+Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
 `sentence order prediction (classification)` head.
 """,
 ALBERT_START_DOCSTRING,
@@ -851,7 +857,7 @@ class AlbertSOPHead(nn.Module):
 )
 class AlbertForMaskedLM(AlbertPreTrainedModel):

-authorized_unexpected_keys = [r"pooler"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]

 def __init__(self, config):
 super().__init__(config)
@@ -1021,7 +1027,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
 )
 class AlbertForTokenClassification(AlbertPreTrainedModel):

-authorized_unexpected_keys = [r"pooler"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]

 def __init__(self, config):
 super().__init__(config)
@@ -1110,7 +1116,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
 )
 class AlbertForQuestionAnswering(AlbertPreTrainedModel):

-authorized_unexpected_keys = [r"pooler"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]

 def __init__(self, config):
 super().__init__(config)

@@ -768,7 +768,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):

 @add_start_docstrings(
 """
-Albert Model with two heads on top for pre-training: a `masked language modeling` head and a `sentence order
+Albert Model with two heads on top for pretraining: a `masked language modeling` head and a `sentence order
 prediction` (classification) head.
 """,
 ALBERT_START_DOCSTRING,
@@ -843,7 +843,7 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
 @add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
 class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):

-authorized_missing_keys = [r"pooler"]
+_keys_to_ignore_on_load_missing = [r"pooler"]

 def __init__(self, config, *inputs, **kwargs):
 super().__init__(config, *inputs, **kwargs)
@@ -1013,7 +1013,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
 )
 class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificationLoss):

-authorized_missing_keys = [r"pooler"]
+_keys_to_ignore_on_load_missing = [r"pooler"]

 def __init__(self, config, *inputs, **kwargs):
 super().__init__(config, *inputs, **kwargs)
@@ -1100,7 +1100,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
 )
 class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringLoss):

-authorized_missing_keys = [r"pooler"]
+_keys_to_ignore_on_load_missing = [r"pooler"]

 def __init__(self, config, *inputs, **kwargs):
 super().__init__(config, *inputs, **kwargs)

@@ -2,8 +2,8 @@
 # There's no way to ignore "F401 '...' imported but unused" warnings in this
 # module, but to preserve other warnings. So, don't check this module at all.

-from ...file_utils import is_tf_available, is_torch_available
+from ...file_utils import is_flax_available, is_tf_available, is_torch_available
-from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, AutoConfig
+from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, MODEL_NAMES_MAPPING, AutoConfig
 from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer


@@ -57,3 +57,6 @@ if is_tf_available():
 TFAutoModelForTokenClassification,
 TFAutoModelWithLMHead,
 )
+
+if is_flax_available():
+from .modeling_flax_auto import FLAX_MODEL_MAPPING, FlaxAutoModel

@@ -36,7 +36,7 @@ ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict(
 for key, value, in pretrained_map.items()
 )

-MODEL_MAPPING = OrderedDict(
+FLAX_MODEL_MAPPING = OrderedDict(
 [
 (RobertaConfig, FlaxRobertaModel),
 (BertConfig, FlaxBertModel),
@@ -79,13 +79,13 @@ class FlaxAutoModel(object):
 model = FlaxAutoModel.from_config(config)
 # E.g. model was saved using `save_pretrained('./test/saved_model/')`
 """
-for config_class, model_class in MODEL_MAPPING.items():
+for config_class, model_class in FLAX_MODEL_MAPPING.items():
 if isinstance(config, config_class):
 return model_class(config)
 raise ValueError(
 f"Unrecognized configuration class {config.__class__} "
 f"for this kind of FlaxAutoModel: {cls.__name__}.\n"
-f"Model type should be one of {', '.join(c.__name__ for c in MODEL_MAPPING.keys())}."
+f"Model type should be one of {', '.join(c.__name__ for c in FLAX_MODEL_MAPPING.keys())}."
 )

 @classmethod
@@ -173,11 +173,11 @@ class FlaxAutoModel(object):
 if not isinstance(config, PretrainedConfig):
 config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

-for config_class, model_class in MODEL_MAPPING.items():
+for config_class, model_class in FLAX_MODEL_MAPPING.items():
 if isinstance(config, config_class):
 return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
 raise ValueError(
 f"Unrecognized configuration class {config.__class__} "
 f"for this kind of FlaxAutoModel: {cls.__name__}.\n"
-f"Model type should be one of {', '.join(c.__name__ for c in MODEL_MAPPING.keys())}"
+f"Model type should be one of {', '.join(c.__name__ for c in FLAX_MODEL_MAPPING.keys())}"
 )

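The renamed FLAX_MODEL_MAPPING keys Flax model classes by config class, which is the dispatch FlaxAutoModel performs internally. A hedged lookup sketch, assuming jax and flax are installed (the import fails otherwise):

    from transformers import BertConfig
    from transformers.models.auto.modeling_flax_auto import FLAX_MODEL_MAPPING

    config = BertConfig()
    flax_class = next(cls for cfg_cls, cls in FLAX_MODEL_MAPPING.items() if isinstance(config, cfg_cls))
    print(flax_class.__name__)  # FlaxBertModel
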
@@ -72,6 +72,7 @@ from .configuration_auto import (
 MarianConfig,
 MBartConfig,
 MobileBertConfig,
+MT5Config,
 OpenAIGPTConfig,
 PegasusConfig,
 ProphetNetConfig,
@@ -173,6 +174,7 @@ TOKENIZER_MAPPING = OrderedDict(
 [
 (RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)),
 (T5Config, (T5Tokenizer, T5TokenizerFast)),
+(MT5Config, (T5Tokenizer, T5TokenizerFast)),
 (MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)),
 (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)),
 (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)),

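With MT5Config mapped onto the T5 tokenizers, AutoTokenizer can resolve an mT5 checkpoint without a dedicated tokenizer class. A hedged sketch; the checkpoint id is assumed and sentencepiece must be installed:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")  # assumed checkpoint id
    print(type(tokenizer).__name__)  # T5TokenizerFast, or T5Tokenizer without the fast backend
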
@@ -108,6 +108,8 @@ class BartConfig(PretrainedConfig):
 force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
 Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only
 :obj:`True` for `bart-large-cnn`.
+use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+Whether or not the model should return the last key/values attentions (not used by all models).
 """
 model_type = "bart"
 keys_to_ignore_at_inference = ["past_key_values"]
@@ -134,9 +136,6 @@ class BartConfig(PretrainedConfig):
 classifier_dropout=0.0,
 num_labels=3,
 is_encoder_decoder=True,
-pad_token_id=1,
-bos_token_id=0,
-eos_token_id=2,
 normalize_before=False,
 add_final_layer_norm=False,
 do_blenderbot_90_layernorm=False,
@@ -145,6 +144,10 @@ class BartConfig(PretrainedConfig):
 static_position_embeddings=False,
 add_bias_logits=False,
 force_bos_token_to_be_generated=False,
+use_cache=True,
+pad_token_id=1,
+bos_token_id=0,
+eos_token_id=2,
 **common_kwargs
 ):
 r"""
@@ -208,6 +211,8 @@ class BartConfig(PretrainedConfig):

 self.do_blenderbot_90_layernorm = do_blenderbot_90_layernorm

+self.use_cache = use_cache
+
 @property
 def num_attention_heads(self) -> int:
 return self.encoder_attention_heads

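The new `use_cache` argument simply lands on the config and gates whether decoding reuses `past_key_values`. A minimal sketch:

    from transformers import BartConfig

    config = BartConfig(use_cache=False)  # recompute keys/values at every decoding step
    assert config.use_cache is False
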
@@ -946,7 +946,7 @@ class BartModel(PretrainedBartModel):
 )
 class BartForConditionalGeneration(PretrainedBartModel):
 base_model_prefix = "model"
-authorized_missing_keys = [r"final_logits_bias", r"encoder\.version", r"decoder\.version"]
+_keys_to_ignore_on_load_missing = [r"final_logits_bias", r"encoder\.version", r"decoder\.version"]

 def __init__(self, config: BartConfig):
 super().__init__(config)

@@ -1020,10 +1020,10 @@ class TFBartModel(TFPretrainedBartModel):
 )
 class TFBartForConditionalGeneration(TFPretrainedBartModel):
 base_model_prefix = "model"
-authorized_missing_keys = [
+_keys_to_ignore_on_load_missing = [
 r"final_logits_bias",
 ]
-authorized_unexpected_keys = [
+_keys_to_ignore_on_load_unexpected = [
 r"model.encoder.embed_tokens.weight",
 r"model.decoder.embed_tokens.weight",
 ]

@@ -178,6 +178,7 @@ class BertEmbeddings(nn.Module):

 # position_ids (1, len position emb) is contiguous in memory and exported when serialized
 self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

 def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
 if input_ids is not None:
@@ -222,6 +223,10 @@ class BertSelfAttention(nn.Module):
 self.value = nn.Linear(config.hidden_size, self.all_head_size)

 self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+self.max_position_embeddings = config.max_position_embeddings
+self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

 def transpose_for_scores(self, x):
 new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
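How the added `distance_embedding` table gets indexed, as a hedged standalone sketch with illustrative sizes: pairwise query/key distances are shifted by `max_position_embeddings - 1` so they fall inside the `2 * max_position_embeddings - 1` rows.

    import torch

    max_position_embeddings, seq_len, head_size = 512, 4, 64
    distance_embedding = torch.nn.Embedding(2 * max_position_embeddings - 1, head_size)

    position_ids = torch.arange(seq_len)
    distance = position_ids.view(-1, 1) - position_ids.view(1, -1)  # values in [-(seq_len-1), seq_len-1]
    positional_embedding = distance_embedding(distance + max_position_embeddings - 1)
    print(positional_embedding.shape)  # torch.Size([4, 4, 64])
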
@@ -598,7 +603,7 @@ class BertPreTrainedModel(PreTrainedModel):
 config_class = BertConfig
 load_tf_weights = load_tf_weights_in_bert
 base_model_prefix = "bert"
-authorized_missing_keys = [r"position_ids"]
+_keys_to_ignore_on_load_missing = [r"position_ids"]

 def _init_weights(self, module):
 """ Initialize the weights """
@@ -864,7 +869,7 @@ class BertModel(BertPreTrainedModel):

 @add_start_docstrings(
 """
-Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next
+Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
 sentence prediction (classification)` head.
 """,
 BERT_START_DOCSTRING,
@@ -969,8 +974,8 @@ class BertForPreTraining(BertPreTrainedModel):
 )
 class BertLMHeadModel(BertPreTrainedModel):

-authorized_unexpected_keys = [r"pooler"]
-authorized_missing_keys = [r"position_ids", r"predictions.decoder.bias"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]
+_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

 def __init__(self, config):
 super().__init__(config)
@@ -1087,8 +1092,8 @@ class BertLMHeadModel(BertPreTrainedModel):
 @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
 class BertForMaskedLM(BertPreTrainedModel):

-authorized_unexpected_keys = [r"pooler"]
-authorized_missing_keys = [r"position_ids", r"predictions.decoder.bias"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]
+_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

 def __init__(self, config):
 super().__init__(config)
@@ -1469,7 +1474,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
 )
 class BertForTokenClassification(BertPreTrainedModel):

-authorized_unexpected_keys = [r"pooler"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]

 def __init__(self, config):
 super().__init__(config)
@@ -1560,7 +1565,7 @@ class BertForTokenClassification(BertPreTrainedModel):
 )
 class BertForQuestionAnswering(BertPreTrainedModel):

-authorized_unexpected_keys = [r"pooler"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]

 def __init__(self, config):
 super().__init__(config)

@@ -183,6 +183,10 @@ class FlaxBertAttention(nn.Module):

 @nn.compact
 def __call__(self, hidden_state, attention_mask):
+# Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length)
+# FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable
+# with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length)
+attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
 self_att = nn.attention.SelfAttention(num_heads=self.num_heads, qkv_features=self.head_size, name="self")(
 hidden_state, attention_mask
 )

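A hedged sketch of the broadcasting the comment above relies on, assuming jax is installed; the batch size and lengths are illustrative:

    import jax.numpy as jnp

    batch, num_heads, q_len, kv_len = 2, 12, 7, 7
    attention_mask = jnp.ones((batch, kv_len))
    attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))  # -> (batch, 1, 1, kv_len)
    attn_weights = jnp.zeros((batch, num_heads, q_len, kv_len))
    print((attn_weights + attention_mask).shape)  # (2, 12, 7, 7)
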
@@ -91,7 +91,7 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [

 class TFBertPreTrainingLoss:
 """
-Loss function suitable for BERT-like pre-training, that is, the task of pretraining a language model by combining
+Loss function suitable for BERT-like pretraining, that is, the task of pretraining a language model by combining
 NSP + MLM. .. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss
 computation.
 """
@@ -842,7 +842,7 @@ class TFBertModel(TFBertPreTrainedModel):

 @add_start_docstrings(
 """
-Bert Model with two heads on top as done during the pre-training:
+Bert Model with two heads on top as done during the pretraining:
 a `masked language modeling` head and a `next sentence prediction (classification)` head.
 """,
 BERT_START_DOCSTRING,
@@ -938,8 +938,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
 @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
 class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):

-authorized_unexpected_keys = [r"pooler"]
-authorized_missing_keys = [r"pooler"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]
+_keys_to_ignore_on_load_missing = [r"pooler"]

 def __init__(self, config, *inputs, **kwargs):
 super().__init__(config, *inputs, **kwargs)
@@ -1023,8 +1023,8 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):

 class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):

-authorized_unexpected_keys = [r"pooler"]
-authorized_missing_keys = [r"pooler"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]
+_keys_to_ignore_on_load_missing = [r"pooler"]

 def __init__(self, config, *inputs, **kwargs):
 super().__init__(config, *inputs, **kwargs)
@@ -1416,8 +1416,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
 )
 class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationLoss):

-authorized_unexpected_keys = [r"pooler"]
-authorized_missing_keys = [r"pooler"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]
+_keys_to_ignore_on_load_missing = [r"pooler"]

 def __init__(self, config, *inputs, **kwargs):
 super().__init__(config, *inputs, **kwargs)
@@ -1502,8 +1502,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
 )
 class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss):

-authorized_unexpected_keys = [r"pooler"]
-authorized_missing_keys = [r"pooler"]
+_keys_to_ignore_on_load_unexpected = [r"pooler"]
+_keys_to_ignore_on_load_missing = [r"pooler"]

 def __init__(self, config, *inputs, **kwargs):
 super().__init__(config, *inputs, **kwargs)

@@ -173,7 +173,7 @@ class BertGenerationPreTrainedModel(PreTrainedModel):

 config_class = BertGenerationConfig
 base_model_prefix = "bert"
-authorized_missing_keys = [r"position_ids"]
+_keys_to_ignore_on_load_missing = [r"position_ids"]

 def _init_weights(self, module):
 """ Initialize the weights """

@@ -80,7 +80,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
 normalization (:obj:`bool`, `optional`, defaults to :obj:`False`)
 Whether or not to apply a normalization preprocess.
 bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
-The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

 .. note::


@@ -61,6 +61,9 @@ class CTRLConfig(PretrainedConfig):
 The epsilon to use in the layer normalization layers
 initializer_range (:obj:`float`, `optional`, defaults to 0.02):
 The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+Whether or not the model should return the last key/values attentions (not used by all models).
+

 Examples::

@@ -98,6 +101,7 @@ class CTRLConfig(PretrainedConfig):
 summary_activation=None,
 summary_proj_to_labels=True,
 summary_first_dropout=0.1,
+use_cache=True,
 **kwargs
 ):
 super().__init__(**kwargs)
@@ -119,6 +123,7 @@ class CTRLConfig(PretrainedConfig):
 self.summary_activation = summary_activation
 self.summary_first_dropout = summary_first_dropout
 self.summary_proj_to_labels = summary_proj_to_labels
+self.use_cache = use_cache

 @property
 def max_position_embeddings(self):

@@ -756,7 +756,7 @@ class DebertaPreTrainedModel(PreTrainedModel):

 config_class = DebertaConfig
 base_model_prefix = "deberta"
-authorized_missing_keys = ["position_ids"]
+_keys_to_ignore_on_load_missing = ["position_ids"]

 def _init_weights(self, module):
 """ Initialize the weights """
@@ -772,7 +772,7 @@ DEBERTA_START_DOCSTRING = r"""
 The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
 <https://arxiv.org/abs/2006.03654>`_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
 BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
-improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-training data.
+improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

 This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
 subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to

@@ -71,6 +71,13 @@ class DPRConfig(PretrainedConfig):
 The epsilon used by the layer normalization layers.
 gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
 If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
+Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
+:obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
+:obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
+<https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
+`Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
+<https://arxiv.org/abs/2009.13658>`__.
 projection_dim (:obj:`int`, `optional`, defaults to 0):
 Dimension of the projection for the context and question encoders. If it is set to zero (default), then no
 projection is done.
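A hedged instantiation sketch for the documented values; nothing beyond the new keyword is implied:

    from transformers import DPRConfig

    config = DPRConfig(position_embedding_type="relative_key_query")
    print(config.position_embedding_type)  # "absolute", "relative_key", or "relative_key_query"
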
@@ -93,6 +100,7 @@ class DPRConfig(PretrainedConfig):
 layer_norm_eps=1e-12,
 pad_token_id=0,
 gradient_checkpointing=False,
+position_embedding_type="absolute",
 projection_dim: int = 0,
 **kwargs
 ):
@@ -112,3 +120,4 @@ class DPRConfig(PretrainedConfig):
 self.layer_norm_eps = layer_norm_eps
 self.gradient_checkpointing = gradient_checkpointing
 self.projection_dim = projection_dim
+self.position_embedding_type = position_embedding_type

@@ -279,7 +279,7 @@ class DPRPretrainedContextEncoder(PreTrainedModel):
 config_class = DPRConfig
 load_tf_weights = None
 base_model_prefix = "ctx_encoder"
-authorized_missing_keys = [r"position_ids"]
+_keys_to_ignore_on_load_missing = [r"position_ids"]

 def init_weights(self):
 self.ctx_encoder.init_weights()
@@ -294,7 +294,7 @@ class DPRPretrainedQuestionEncoder(PreTrainedModel):
 config_class = DPRConfig
 load_tf_weights = None
 base_model_prefix = "question_encoder"
-authorized_missing_keys = [r"position_ids"]
+_keys_to_ignore_on_load_missing = [r"position_ids"]

 def init_weights(self):
 self.question_encoder.init_weights()
@@ -309,7 +309,7 @@ class DPRPretrainedReader(PreTrainedModel):
 config_class = DPRConfig
 load_tf_weights = None
 base_model_prefix = "span_predictor"
-authorized_missing_keys = [r"position_ids"]
+_keys_to_ignore_on_load_missing = [r"position_ids"]

 def init_weights(self):
 self.span_predictor.encoder.init_weights()

@@ -165,6 +165,7 @@ class ElectraEmbeddings(nn.Module):

 # position_ids (1, len position emb) is contiguous in memory and exported when serialized
 self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

 # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward
 def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
@@ -211,6 +212,10 @@ class ElectraSelfAttention(nn.Module):
 self.value = nn.Linear(config.hidden_size, self.all_head_size)

 self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+self.max_position_embeddings = config.max_position_embeddings
+self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

 def transpose_for_scores(self, x):
 new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
@@ -544,8 +549,8 @@ class ElectraPreTrainedModel(PreTrainedModel):
 config_class = ElectraConfig
 load_tf_weights = load_tf_weights_in_electra
 base_model_prefix = "electra"
-authorized_missing_keys = [r"position_ids"]
-authorized_unexpected_keys = [r"electra\.embeddings_project\.weight", r"electra\.embeddings_project\.bias"]
+_keys_to_ignore_on_load_missing = [r"position_ids"]
+_keys_to_ignore_on_load_unexpected = [r"electra\.embeddings_project\.weight", r"electra\.embeddings_project\.bias"]

 # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
 def _init_weights(self, module):
@@ -867,8 +872,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):

 @add_start_docstrings(
 """
-Electra model with a binary classification head on top as used during pre-training for identifying generated
-tokens.
+Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

 It is recommended to load the discriminator checkpoint into that model.
 """,

@@ -734,8 +734,7 @@ class TFElectraModel(TFElectraPreTrainedModel):

 @add_start_docstrings(
 """
-Electra model with a binary classification head on top as used during pre-training for identifying generated
-tokens.
+Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

 Even though both the discriminator and generator may be loaded into this model, the discriminator is the only model
 of the two to have the correct classification head to be used for this model.

@@ -109,6 +109,8 @@ class FSMTConfig(PretrainedConfig):
 early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`)
 Flag that will be used by default in the :obj:`generate` method of the model. Whether to stop the beam
 search when at least ``num_beams`` sentences are finished per batch or not.
+use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+Whether or not the model should return the last key/values attentions (not used by all models).

 Examples::

@@ -142,9 +144,6 @@ class FSMTConfig(PretrainedConfig):
 dropout=0.1,
 activation_dropout=0.0,
 init_std=0.02,
-pad_token_id=1,
-bos_token_id=0,
-eos_token_id=2,
 decoder_start_token_id=2,
 is_encoder_decoder=True,
 scale_embedding=True,
@@ -152,6 +151,10 @@ class FSMTConfig(PretrainedConfig):
 num_beams=5,
 length_penalty=1.0,
 early_stopping=False,
+use_cache=True,
+pad_token_id=1,
+bos_token_id=0,
+eos_token_id=2,
 **common_kwargs
 ):
 if "hidden_size" in common_kwargs:
@@ -196,6 +199,8 @@ class FSMTConfig(PretrainedConfig):
 self.activation_dropout = activation_dropout
 self.dropout = dropout

+self.use_cache = use_cache
+
 @property
 def num_attention_heads(self) -> int:
 return self.encoder_attention_heads

@@ -951,7 +951,7 @@ class FSMTModel(PretrainedFSMTModel):
 output_hidden_states=output_hidden_states,
 return_dict=return_dict,
 )
-# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_dict=False
+# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
 elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
 encoder_outputs = BaseModelOutput(
 last_hidden_state=encoder_outputs[0],
@@ -1005,11 +1005,11 @@ class FSMTModel(PretrainedFSMTModel):
 )
 class FSMTForConditionalGeneration(PretrainedFSMTModel):
 base_model_prefix = "model"
-authorized_missing_keys = [
+_keys_to_ignore_on_load_missing = [
 "model.encoder.embed_positions.weight",
 "model.decoder.embed_positions.weight",
 ]
-keys_to_never_save = [
+_keys_to_ignore_on_save = [
 "model.encoder.embed_positions.weight",
 "model.decoder.embed_positions.weight",
 ]

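A hedged sketch of what the renamed `_keys_to_ignore_on_save` controls: `save_pretrained` drops those exact keys from the state dict before writing it, which suits deterministic weights such as FSMT's sinusoidal position embeddings. The dictionaries below are illustrative stand-ins, not real model weights:

    _keys_to_ignore_on_save = ["model.encoder.embed_positions.weight", "model.decoder.embed_positions.weight"]
    state_dict = {
        "model.encoder.embed_positions.weight": "...",  # deterministic, rebuilt at load time
        "model.shared.weight": "...",                    # trained weight, kept
    }
    if _keys_to_ignore_on_save is not None:
        state_dict = {k: v for k, v in state_dict.items() if k not in _keys_to_ignore_on_save}
    print(sorted(state_dict))  # ['model.shared.weight']
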
@ -1181,7 +1181,7 @@ class TFFunnelModel(TFFunnelPreTrainedModel):
|
|||||||
|
|
||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"""
|
"""
|
||||||
Funnel model with a binary classification head on top as used during pre-training for identifying generated tokens.
|
Funnel model with a binary classification head on top as used during pretraining for identifying generated tokens.
|
||||||
""",
|
""",
|
||||||
FUNNEL_START_DOCSTRING,
|
FUNNEL_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
|
@ -104,6 +104,8 @@ class GPT2Config(PretrainedConfig):
|
|||||||
The dropout ratio to be used after the projection and activation.
|
The dropout ratio to be used after the projection and activation.
|
||||||
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.
|
Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||||
|
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||||
|
|
||||||
Example::
|
Example::
|
||||||
|
|
||||||
@ -142,9 +144,10 @@ class GPT2Config(PretrainedConfig):
|
|||||||
summary_activation=None,
|
summary_activation=None,
|
||||||
summary_proj_to_labels=True,
|
summary_proj_to_labels=True,
|
||||||
summary_first_dropout=0.1,
|
summary_first_dropout=0.1,
|
||||||
|
gradient_checkpointing=False,
|
||||||
|
use_cache=True,
|
||||||
bos_token_id=50256,
|
bos_token_id=50256,
|
||||||
eos_token_id=50256,
|
eos_token_id=50256,
|
||||||
gradient_checkpointing=False,
|
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||||
@ -168,6 +171,7 @@ class GPT2Config(PretrainedConfig):
|
|||||||
self.summary_first_dropout = summary_first_dropout
|
self.summary_first_dropout = summary_first_dropout
|
||||||
self.summary_proj_to_labels = summary_proj_to_labels
|
self.summary_proj_to_labels = summary_proj_to_labels
|
||||||
self.gradient_checkpointing = gradient_checkpointing
|
self.gradient_checkpointing = gradient_checkpointing
|
||||||
|
self.use_cache = use_cache
|
||||||
|
|
||||||
self.bos_token_id = bos_token_id
|
self.bos_token_id = bos_token_id
|
||||||
self.eos_token_id = eos_token_id
|
self.eos_token_id = eos_token_id
|
||||||
|
@ -685,7 +685,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
GPT2_START_DOCSTRING,
|
GPT2_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class GPT2LMHeadModel(GPT2PreTrainedModel):
|
class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||||
authorized_missing_keys = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
|
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
@ -975,7 +975,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
GPT2_START_DOCSTRING,
|
GPT2_START_DOCSTRING,
|
||||||
)
|
)
|
||||||
class GPT2ForSequenceClassification(GPT2PreTrainedModel):
|
class GPT2ForSequenceClassification(GPT2PreTrainedModel):
|
||||||
authorized_missing_keys = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
|
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
|
@ -146,6 +146,10 @@ class LayoutLMSelfAttention(nn.Module):
|
|||||||
self.value = nn.Linear(config.hidden_size, self.all_head_size)
|
self.value = nn.Linear(config.hidden_size, self.all_head_size)
|
||||||
|
|
||||||
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
||||||
|
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||||
|
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
|
||||||
|
self.max_position_embeddings = config.max_position_embeddings
|
||||||
|
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
|
||||||
|
|
||||||
def transpose_for_scores(self, x):
|
def transpose_for_scores(self, x):
|
||||||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
||||||
@ -509,7 +513,7 @@ class LayoutLMPreTrainedModel(PreTrainedModel):
|
|||||||
|
|
||||||
config_class = LayoutLMConfig
|
config_class = LayoutLMConfig
|
||||||
base_model_prefix = "layoutlm"
|
base_model_prefix = "layoutlm"
|
||||||
authorized_missing_keys = [r"position_ids"]
|
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||||
|
|
||||||
def _init_weights(self, module):
|
def _init_weights(self, module):
|
||||||
""" Initialize the weights """
|
""" Initialize the weights """
|
||||||
|
@@ -460,6 +460,7 @@ class LongformerEmbeddings(nn.Module):

         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

         # End copy
         self.padding_idx = config.pad_token_id

@@ -1303,7 +1304,7 @@ class LongformerPreTrainedModel(PreTrainedModel):

     config_class = LongformerConfig
     base_model_prefix = "longformer"
-    authorized_missing_keys = [r"position_ids"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """ Initialize the weights """

@@ -1621,7 +1622,7 @@ class LongformerModel(LongformerPreTrainedModel):
 @add_start_docstrings("""Longformer Model with a `language modeling` head on top. """, LONGFORMER_START_DOCSTRING)
 class LongformerForMaskedLM(LongformerPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

     def __init__(self, config):
         super().__init__(config)

@@ -1718,7 +1719,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
 )
 class LongformerForSequenceClassification(LongformerPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

     def __init__(self, config):
         super().__init__(config)

@@ -1827,7 +1828,7 @@ class LongformerClassificationHead(nn.Module):
 )
 class LongformerForQuestionAnswering(LongformerPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

     def __init__(self, config):
         super().__init__(config)

@@ -1961,7 +1962,7 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
 )
 class LongformerForTokenClassification(LongformerPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

     def __init__(self, config):
         super().__init__(config)
@@ -1961,7 +1961,7 @@ class TFLongformerModel(TFLongformerPreTrainedModel):
 )
 class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModelingLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)

@@ -2048,7 +2048,7 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel
 )
 class TFLongformerForQuestionAnswering(TFLongformerPreTrainedModel, TFQuestionAnsweringLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)

@@ -2199,7 +2199,7 @@ class TFLongformerClassificationHead(tf.keras.layers.Layer):
 )
 class TFLongformerForSequenceClassification(TFLongformerPreTrainedModel, TFSequenceClassificationLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)

@@ -2443,7 +2443,7 @@ class TFLongformerForMultipleChoice(TFLongformerPreTrainedModel, TFMultipleChoic
 )
 class TFLongformerForTokenClassification(TFLongformerPreTrainedModel, TFTokenClassificationLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
@@ -1013,7 +1013,7 @@ class LxmertModel(LxmertPreTrainedModel):


 @add_start_docstrings(
-    """Lxmert Model with a specified pre-training head on top. """,
+    """Lxmert Model with a specified pretraining head on top. """,
     LXMERT_START_DOCSTRING,
 )
 class LxmertForPreTraining(LxmertPreTrainedModel):

@@ -1024,7 +1024,7 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
         self.num_qa_labels = config.num_qa_labels
         self.visual_loss_normalizer = config.visual_loss_normalizer

-        # Use of pre-training tasks
+        # Use of pretraining tasks
         self.task_mask_lm = config.task_mask_lm
         self.task_obj_predict = config.task_obj_predict
         self.task_matched = config.task_matched
@@ -1139,7 +1139,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
         self.num_qa_labels = config.num_qa_labels
         self.visual_loss_normalizer = config.visual_loss_normalizer

-        # Use of pre-training tasks
+        # Use of pretraining tasks
         self.task_mask_lm = config.task_mask_lm
         self.task_obj_predict = config.task_obj_predict
         self.task_matched = config.task_matched
@@ -47,11 +47,11 @@ class MarianMTModel(BartForConditionalGeneration):

     """
     config_class = MarianConfig
-    authorized_missing_keys = [
+    _keys_to_ignore_on_load_missing = [
         "model.encoder.embed_positions.weight",
         "model.decoder.embed_positions.weight",
     ]
-    keys_to_never_save = [
+    _keys_to_ignore_on_save = [
         "model.encoder.embed_positions.weight",
         "model.decoder.embed_positions.weight",
     ]
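The attribute renames running through these hunks (authorized_missing_keys to _keys_to_ignore_on_load_missing, keys_to_never_save to _keys_to_ignore_on_save, authorized_unexpected_keys to _keys_to_ignore_on_load_unexpected) only touch class-level lists of patterns that loading and saving match against state-dict keys. A rough sketch of the matching idea, using hypothetical key names rather than the library's internal loading code:

import re

# Hypothetical missing keys reported while loading a checkpoint.
missing_keys = ["model.encoder.embed_positions.weight", "classifier.bias"]

# Patterns in the style of _keys_to_ignore_on_load_missing.
keys_to_ignore_on_load_missing = [
    r"model\.encoder\.embed_positions\.weight",
    r"model\.decoder\.embed_positions\.weight",
]

# Keys matching any pattern are dropped from the warning; the rest are reported.
unexpected_missing = [
    key for key in missing_keys
    if not any(re.search(pattern, key) for pattern in keys_to_ignore_on_load_missing)
]
print(unexpected_missing)  # ['classifier.bias']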
@@ -37,7 +37,7 @@ logger = logging.get_logger(__name__)

 @add_start_docstrings("Marian model for machine translation", START_DOCSTRING)
 class TFMarianMTModel(TFBartForConditionalGeneration):
-    authorized_missing_keys = [
+    _keys_to_ignore_on_load_missing = [
         r"model.encoder.embed_positions.weight",
         r"model.decoder.embed_positions.weight",
     ]
@@ -29,11 +29,11 @@ class MBartForConditionalGeneration(BartForConditionalGeneration):
     """
     model_type = "mbart"
     config_class = MBartConfig
-    authorized_missing_keys = [
+    _keys_to_ignore_on_load_missing = [
         "model.encoder.embed_positions.weight",
         "model.decoder.embed_positions.weight",
     ]
-    keys_to_never_save = [
+    _keys_to_ignore_on_save = [
         "model.encoder.embed_positions.weight",
         "model.decoder.embed_positions.weight",
     ]
@@ -188,7 +188,7 @@ class MBartTokenizer(XLMRobertaTokenizer):
         **kwargs,
     ) -> BatchEncoding:
         if max_length is None:
-            max_length = self.max_len
+            max_length = self.model_max_length
         self.set_src_lang_special_tokens(src_lang)
         model_inputs: BatchEncoding = self(
             src_texts,
@@ -185,7 +185,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
         **kwargs,
     ) -> BatchEncoding:
         if max_length is None:
-            max_length = self.max_len
+            max_length = self.model_max_length
         self.set_src_lang_special_tokens(src_lang)
         model_inputs: BatchEncoding = self(
             src_texts,
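Both MBart tokenizer hunks above switch the fallback from the removed max_len alias to model_max_length when no max_length is passed. A small usage sketch of that code path, assuming the v4.0-era prepare_seq2seq_batch API; the checkpoint name and sentences are illustrative and any mBART checkpoint would do:

from transformers import MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")

batch = tokenizer.prepare_seq2seq_batch(
    src_texts=["UN Chief Says There Is No Military Solution in Syria"],
    src_lang="en_XX",
    tgt_texts=["Şeful ONU declară că nu există o soluţie militară în Siria"],
    tgt_lang="ro_RO",
    # max_length is omitted, so the tokenizer falls back to tokenizer.model_max_length.
    return_tensors="pt",
)
print(batch["input_ids"].shape)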
@@ -677,7 +677,7 @@ class MobileBertPreTrainedModel(PreTrainedModel):
     pretrained_model_archive_map = MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST
     load_tf_weights = load_tf_weights_in_mobilebert
     base_model_prefix = "mobilebert"
-    authorized_missing_keys = [r"position_ids"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """ Initialize the weights """

@@ -933,7 +933,7 @@ class MobileBertModel(MobileBertPreTrainedModel):

 @add_start_docstrings(
     """
-    MobileBert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a
+    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
     `next sentence prediction (classification)` head.
     """,
     MOBILEBERT_START_DOCSTRING,

@@ -1054,7 +1054,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
 @add_start_docstrings("""MobileBert Model with a `language modeling` head on top. """, MOBILEBERT_START_DOCSTRING)
 class MobileBertForMaskedLM(MobileBertPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

     def __init__(self, config):
         super().__init__(config)

@@ -1350,7 +1350,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
 )
 class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

     def __init__(self, config):
         super().__init__(config)

@@ -1545,7 +1545,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
 )
 class MobileBertForTokenClassification(MobileBertPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

     def __init__(self, config):
         super().__init__(config)
@@ -975,7 +975,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):

 @add_start_docstrings(
     """
-    MobileBert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a
+    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
     `next sentence prediction (classification)` head.
     """,
     MOBILEBERT_START_DOCSTRING,

@@ -1030,7 +1030,7 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
 @add_start_docstrings("""MobileBert Model with a `language modeling` head on top. """, MOBILEBERT_START_DOCSTRING)
 class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)

@@ -1297,7 +1297,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
 )
 class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAnsweringLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)

@@ -1529,7 +1529,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
 )
 class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenClassificationLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
@@ -60,6 +60,8 @@ class MT5Config(PretrainedConfig):
             testing).
         feed_forward_proj (:obj:`string`, `optional`, defaults to :obj:`"gated-gelu"`):
             Type of feed forward layer to be used. Should be one of :obj:`"relu"` or :obj:`"gated-gelu"`.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
     """
     model_type = "mt5"
     keys_to_ignore_at_inference = ["past_key_values"]

@@ -79,6 +81,7 @@ class MT5Config(PretrainedConfig):
         initializer_factor=1.0,
         feed_forward_proj="gated-gelu",
         is_encoder_decoder=True,
+        use_cache=True,
         tokenizer_class="T5Tokenizer",
         tie_word_embeddings=False,
         pad_token_id=0,

@@ -109,6 +112,7 @@ class MT5Config(PretrainedConfig):
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_factor = initializer_factor
         self.feed_forward_proj = feed_forward_proj
+        self.use_cache = use_cache

     @property
     def hidden_size(self):
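The three MT5Config hunks above wire use_cache through end to end: a docstring entry, a constructor default, and the stored attribute. A minimal sketch of reading and overriding the option (values shown are illustrative):

from transformers import MT5Config

# Defaults follow the diff: feed_forward_proj="gated-gelu", use_cache=True.
config = MT5Config()
print(config.feed_forward_proj, config.use_cache)  # gated-gelu True

# The flag can be overridden, e.g. to disable caching of past key/values.
no_cache_config = MT5Config(use_cache=False)
print(no_cache_config.use_cache)  # False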
Some files were not shown because too many files have changed in this diff.