Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-21 01:23:56 +08:00)

Compare commits: update_ssh...v4.0.1 (26 commits)

Commit SHA1s:
e20ac6611d, 0351c5acef, 28d3ccd04a, c328cb872d, e6399320c6, c781171dfa, ab597c84d1,
e72b4fafeb, dc0dea3e42, 4d8f5d12b3, 710b0108c9, 87199dee00, 68879472c4, 8c5a2b8e36,
911d8486e8, 563efd36ab, 5a63232a8a, e46890f699, df2cdd84f3, c6e2876cd4, 5580cccd81,
ccc4f64044, 3408e6ffcd, a986b02e49, b6ec39e41f, f80ea27f80
@ -77,7 +77,7 @@ jobs:
|
||||
- v0.4-torch_and_tf-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[sklearn,tf-cpu,torch,testing]
|
||||
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
|
||||
- save_cache:
|
||||
key: v0.4-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -103,7 +103,7 @@ jobs:
|
||||
- v0.4-torch-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[sklearn,torch,testing]
|
||||
- run: pip install .[sklearn,torch,testing,sentencepiece]
|
||||
- save_cache:
|
||||
key: v0.4-torch-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -129,7 +129,7 @@ jobs:
|
||||
- v0.4-tf-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[sklearn,tf-cpu,testing]
|
||||
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
|
||||
- save_cache:
|
||||
key: v0.4-tf-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -155,7 +155,7 @@ jobs:
|
||||
- v0.4-flax-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: sudo pip install .[flax,sklearn,torch,testing]
|
||||
- run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece]
|
||||
- save_cache:
|
||||
key: v0.4-flax-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -181,7 +181,7 @@ jobs:
|
||||
- v0.4-torch-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[sklearn,torch,testing]
|
||||
- run: pip install .[sklearn,torch,testing,sentencepiece]
|
||||
- save_cache:
|
||||
key: v0.4-torch-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -207,7 +207,7 @@ jobs:
|
||||
- v0.4-tf-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[sklearn,tf-cpu,testing]
|
||||
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
|
||||
- save_cache:
|
||||
key: v0.4-tf-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
@ -231,7 +231,7 @@ jobs:
|
||||
- v0.4-custom_tokenizers-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[ja,testing]
|
||||
- run: pip install .[ja,testing,sentencepiece]
|
||||
- run: python -m unidic download
|
||||
- save_cache:
|
||||
key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
|
||||
@ -258,7 +258,7 @@ jobs:
|
||||
- v0.4-torch_examples-{{ checksum "setup.py" }}
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install .[sklearn,torch,testing]
|
||||
- run: pip install .[sklearn,torch,sentencepiece,testing]
|
||||
- run: pip install -r examples/requirements.txt
|
||||
- save_cache:
|
||||
key: v0.4-torch_examples-{{ checksum "setup.py" }}
|
||||
@ -324,7 +324,7 @@ jobs:
|
||||
- v0.4-{{ checksum "setup.py" }}
|
||||
- run: pip install --upgrade pip
|
||||
- run: pip install isort
|
||||
- run: pip install .[tf,torch,flax,quality]
|
||||
- run: pip install .[all,quality]
|
||||
- save_cache:
|
||||
key: v0.4-code_quality-{{ checksum "setup.py" }}
|
||||
paths:
|
||||
|
@ -52,4 +52,5 @@ deploy_doc "4b3ee9c" v3.1.0
|
||||
deploy_doc "3ebb1b3" v3.2.0
|
||||
deploy_doc "0613f05" v3.3.1
|
||||
deploy_doc "eb0e0ce" v3.4.0
|
||||
deploy_doc "818878d" # v3.5.1 Latest stable release
|
||||
deploy_doc "818878d" v3.5.1
|
||||
deploy_doc "28d3ccd" # v4.0.1 Latest stable release
|
||||
|
1
.github/conda/build.sh
vendored
Normal file
@ -0,0 +1 @@
|
||||
$PYTHON setup.py install # Python command to install the script.
|
48
.github/conda/meta.yaml
vendored
Normal file
@ -0,0 +1,48 @@
|
||||
{% set name = "transformers" %}
|
||||
|
||||
package:
|
||||
name: "{{ name|lower }}"
|
||||
version: "{{ TRANSFORMERS_VERSION }}"
|
||||
|
||||
source:
|
||||
path: ../../
|
||||
|
||||
build:
|
||||
noarch: python
|
||||
|
||||
requirements:
|
||||
host:
|
||||
- python
|
||||
- pip
|
||||
- numpy
|
||||
- dataclasses
|
||||
- packaging
|
||||
- filelock
|
||||
- requests
|
||||
- tqdm >=4.27
|
||||
- sacremoses
|
||||
- regex !=2019.12.17
|
||||
- protobuf
|
||||
- tokenizers ==0.9.4
|
||||
run:
|
||||
- python
|
||||
- numpy
|
||||
- dataclasses
|
||||
- packaging
|
||||
- filelock
|
||||
- requests
|
||||
- tqdm >=4.27
|
||||
- sacremoses
|
||||
- regex !=2019.12.17
|
||||
- protobuf
|
||||
- tokenizers ==0.9.4
|
||||
|
||||
test:
|
||||
imports:
|
||||
- transformers
|
||||
|
||||
about:
|
||||
home: https://huggingface.co
|
||||
license: Apache License 2.0
|
||||
license_file: LICENSE
|
||||
summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0."
|
43
.github/workflows/release-conda.yml
vendored
Normal file
@ -0,0 +1,43 @@
|
||||
name: Release - Conda
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- v*
|
||||
|
||||
env:
|
||||
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
|
||||
|
||||
jobs:
|
||||
build_and_package:
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -l {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Install miniconda
|
||||
uses: conda-incubator/setup-miniconda@v2
|
||||
with:
|
||||
auto-update-conda: true
|
||||
auto-activate-base: false
|
||||
activate-environment: "build-transformers"
|
||||
channels: huggingface
|
||||
|
||||
- name: Setup conda env
|
||||
run: |
|
||||
conda install -c defaults anaconda-client conda-build
|
||||
|
||||
- name: Extract version
|
||||
run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV
|
||||
|
||||
- name: Build conda packages
|
||||
run: |
|
||||
conda info
|
||||
conda build .github/conda
|
||||
|
||||
- name: Upload to Anaconda
|
||||
run: anaconda upload `conda build .github/conda --output` --force
|
1
.github/workflows/self-scheduled.yml
vendored
@ -9,7 +9,6 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- ci_*
|
||||
- framework-agnostic-tokenizers
|
||||
repository_dispatch:
|
||||
schedule:
|
||||
- cron: "0 0 * * *"
|
||||
|
20
README.md
@ -137,14 +137,16 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta
|
||||
|
||||
## Installation
|
||||
|
||||
### With pip
|
||||
|
||||
This repository is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for [examples](https://github.com/huggingface/transformers/tree/master/examples)) and TensorFlow 2.0.
|
||||
|
||||
You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
|
||||
|
||||
First, create a virtual environment with the version of Python you're going to use and activate it.
|
||||
|
||||
Then, you will need to install one of, or both, TensorFlow 2.0 and PyTorch.
|
||||
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
|
||||
Then, you will need to install at least one of TensorFlow 2.0, PyTorch or Flax.
|
||||
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform and/or [Flax installation page](https://github.com/google/flax#quick-install).
|
||||
|
||||
When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
|
||||
|
||||
@ -154,6 +156,18 @@ pip install transformers
|
||||
|
||||
If you'd like to play with the examples, you must [install the library from source](https://huggingface.co/transformers/installation.html#installing-from-source).
|
||||
|
||||
### With conda
|
||||
|
||||
Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
|
||||
|
||||
🤗 Transformers can be installed using conda as follows:
|
||||
|
||||
```shell script
|
||||
conda install -c huggingface transformers
|
||||
```
|
||||
|
||||
Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
|
||||
|
||||
## Models architectures
|
||||
|
||||
🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/transformers/model_summary.html) for a high-level summary of each of them):
|
||||
@ -197,6 +211,8 @@ ultilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/
|
||||
1. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
|
||||
1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
|
||||
|
||||
To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable).
|
||||
|
||||
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations. You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
||||
|
||||
|
||||
|
@ -2,6 +2,15 @@
|
||||
|
||||
/* Colab dropdown */
|
||||
|
||||
table.center-aligned-table td {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
table.center-aligned-table th {
|
||||
text-align: center;
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
.colab-dropdown {
|
||||
position: relative;
|
||||
display: inline-block;
|
||||
|
@ -1,14 +1,15 @@
|
||||
// These two things need to be updated at each release for the version selector.
|
||||
// Last stable version
|
||||
const stableVersion = "v3.5.0"
|
||||
const stableVersion = "v4.0.1"
|
||||
// Dictionary doc folder to label
|
||||
const versionMapping = {
|
||||
"master": "master",
|
||||
"": "v3.5.0/v3.5.1",
|
||||
"": "v4.0.0/v4.0.1",
|
||||
"v3.5.1": "v3.5.0/v3.5.1",
|
||||
"v3.4.0": "v3.4.0",
|
||||
"v3.3.1": "v3.3.0/v3.3.1",
|
||||
"v3.2.0": "v3.2.0",
|
||||
"v3.1.0": "v3.1.0 (stable)",
|
||||
"v3.1.0": "v3.1.0",
|
||||
"v3.0.2": "v3.0.0/v3.0.1/v3.0.2",
|
||||
"v2.11.0": "v2.11.0",
|
||||
"v2.10.0": "v2.10.0",
|
||||
|
@ -26,7 +26,7 @@ author = u'huggingface'
|
||||
# The short X.Y version
|
||||
version = u''
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = u'3.5.0'
|
||||
release = u'4.0.1'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
@ -35,6 +35,8 @@ Choose the right framework for every part of a model's lifetime:
|
||||
- Move a single model between TF2.0/PyTorch frameworks at will
|
||||
- Seamlessly pick the right framework for training, evaluation, production
|
||||
|
||||
Experimental support for Flax with a few models right now, expected to grow in the coming months.
|
||||
|
||||
Contents
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@ -44,7 +46,7 @@ The documentation is organized in five parts:
|
||||
and a glossary.
|
||||
- **USING 🤗 TRANSFORMERS** contains general tutorials on how to use the library.
|
||||
- **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library.
|
||||
- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general resarch in
|
||||
- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in
|
||||
transformers model
|
||||
- The three last section contain the documentation of each public class and function, grouped in:
|
||||
|
||||
@ -52,8 +54,8 @@ The documentation is organized in five parts:
|
||||
- **MODELS** for the classes and functions related to each model implemented in the library.
|
||||
- **INTERNAL HELPERS** for the classes and functions we use internally.
|
||||
|
||||
The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and
|
||||
conversion utilities for the following models:
|
||||
The library currently contains PyTorch, Tensorflow and Flax implementations, pretrained model weights, usage scripts
|
||||
and conversion utilities for the following models:
|
||||
|
||||
..
|
||||
This list is updated automatically from the README with `make fix-copies`. Do not update manually!
|
||||
@ -166,6 +168,97 @@ conversion utilities for the following models:
|
||||
34. `Other community models <https://huggingface.co/models>`__, contributed by the `community
|
||||
<https://huggingface.co/users>`__.
|
||||
|
||||
|
||||
.. _bigtable:
|
||||
|
||||
The table below represents the current support in the library for each of those models: whether they have a Python
tokenizer (called "slow") or a "fast" tokenizer backed by the 🤗 Tokenizers library, and whether they have support in
PyTorch, TensorFlow and/or Flax.
|
||||
|
||||
..
|
||||
This table is updated automatically from the auto modules with `make fix-copies`. Do not update manually!
|
||||
|
||||
.. rst-class:: center-aligned-table
|
||||
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
|
||||
+=============================+================+================+=================+====================+==============+
|
||||
| ALBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| BART | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| BERT | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| DPR | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| DeBERTa | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| DistilBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| ELECTRA | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| LayoutLM | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Marian | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Pegasus | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| RAG | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| T5 | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| mBART | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
| mT5 | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Get started
|
||||
|
@ -12,9 +12,10 @@ must install it from source.
|
||||
## Installation with pip
|
||||
|
||||
First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
|
||||
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available)
|
||||
and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific
|
||||
install command for your platform.
|
||||
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available),
|
||||
[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or
|
||||
[Flax installation page](https://github.com/google/flax#quick-install)
|
||||
regarding the specific install command for your platform.
|
||||
|
||||
When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
|
||||
|
||||
@ -34,6 +35,12 @@ or 🤗 Transformers and TensorFlow 2.0 in one line with:
|
||||
pip install transformers[tf-cpu]
|
||||
```
|
||||
|
||||
or 🤗 Transformers and Flax in one line with:
|
||||
|
||||
```bash
|
||||
pip install transformers[flax]
|
||||
```
|
||||
|
||||
To check 🤗 Transformers is properly installed, run the following command:
|
||||
|
||||
```bash
|
||||
@ -66,19 +73,32 @@ python -c "from transformers import pipeline; print(pipeline('sentiment-analysis
|
||||
|
||||
to check 🤗 Transformers is properly installed.
|
||||
|
||||
|
||||
## With conda
|
||||
|
||||
Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
|
||||
|
||||
🤗 Transformers can be installed using conda as follows:
|
||||
|
||||
```
|
||||
conda install -c huggingface transformers
|
||||
```
|
||||
|
||||
Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
|
||||
|
||||
## Caching models
|
||||
|
||||
This library provides pretrained models that will be downloaded and cached locally. Unless you specify a location with
|
||||
`cache_dir=...` when you use methods like `from_pretrained`, these models will automatically be downloaded in the
|
||||
folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the PyTorch
|
||||
cache home followed by ``/transformers/`` (even if you don't have PyTorch installed). This is (by order of priority):
|
||||
folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the Hugging
|
||||
Face cache home followed by ``/transformers/``. This is (by order of priority):
|
||||
|
||||
* shell environment variable ``TORCH_HOME``
|
||||
* shell environment variable ``XDG_CACHE_HOME`` + ``/torch/``
|
||||
* default: ``~/.cache/torch/``
|
||||
* shell environment variable ``HF_HOME``
|
||||
* shell environment variable ``XDG_CACHE_HOME`` + ``/huggingface/``
|
||||
* default: ``~/.cache/huggingface/``
|
||||
|
||||
So if you don't have any specific environment variable set, the cache directory will be at
|
||||
``~/.cache/torch/transformers/``.
|
||||
``~/.cache/huggingface/transformers/``.
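If you want the cache somewhere else, here is a minimal sketch of the two usual options (this snippet is not part of the diff; the path and checkpoint name are only examples): set the environment variable before importing the library, or pass `cache_dir` for a single download.

```py
import os

# Redirect the whole cache by exporting TRANSFORMERS_CACHE before importing
# transformers (the path is just an example).
os.environ["TRANSFORMERS_CACHE"] = "/data/hf_cache"

from transformers import AutoModel

# Or override the cache location for one download only with `cache_dir`.
model = AutoModel.from_pretrained("bert-base-cased", cache_dir="/data/hf_cache")
```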
|
||||
|
||||
**Note:** If you have set a shell environment variable for one of the predecessors of this library
|
||||
(``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
|
||||
@ -97,6 +117,6 @@ You should check out our [swift-coreml-transformers](https://github.com/huggingf
|
||||
It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`,
|
||||
`DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
|
||||
|
||||
At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch or
|
||||
At some point in the future, you'll be able to seamlessly move from pretraining or fine-tuning models in PyTorch or
|
||||
TensorFlow 2.0 to productizing them in CoreML, or prototype a model or an app in CoreML then research its
|
||||
hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!
|
||||
|
@ -1,5 +1,170 @@
|
||||
# Migrating from previous packages
|
||||
|
||||
## Migrating from transformers `v3.x` to `v4.x`
|
||||
|
||||
A couple of changes were introduced when the switch from version 3 to version 4 was done. Below is a summary of the
|
||||
expected changes:
|
||||
|
||||
#### 1. AutoTokenizers and pipelines now use fast (rust) tokenizers by default.
|
||||
|
||||
The python and rust tokenizers have roughly the same API, but the rust tokenizers have a more complete feature set.
|
||||
|
||||
This introduces two breaking changes:
|
||||
- The handling of overflowing tokens between the python and rust tokenizers is different.
|
||||
- The rust tokenizers do not accept integers in the encoding methods.
|
||||
|
||||
##### How to obtain the same behavior as v3.x in v4.x
|
||||
|
||||
- The pipelines now contain additional features out of the box. See the [token-classification pipeline with the `grouped_entities` flag](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=textclassification#tokenclassificationpipeline).
|
||||
- The auto-tokenizers now return rust tokenizers. In order to obtain the python tokenizers instead, the user may use the `use_fast` flag by setting it to `False`:
|
||||
|
||||
In version `v3.x`:
|
||||
```py
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
```
|
||||
to obtain the same in version `v4.x`:
|
||||
```py
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
|
||||
```
|
||||
|
||||
#### 2. SentencePiece is removed from the required dependencies
|
||||
|
||||
The requirement on the SentencePiece dependency has been lifted from the `setup.py`. This is done so that we may have a channel on anaconda cloud without relying on `conda-forge`. This means that the tokenizers that depend on the SentencePiece library will not be available with a standard `transformers` installation.
|
||||
|
||||
This includes the **slow** versions of:
|
||||
- `XLNetTokenizer`
|
||||
- `AlbertTokenizer`
|
||||
- `CamembertTokenizer`
|
||||
- `MBartTokenizer`
|
||||
- `PegasusTokenizer`
|
||||
- `T5Tokenizer`
|
||||
- `ReformerTokenizer`
|
||||
- `XLMRobertaTokenizer`
|
||||
|
||||
##### How to obtain the same behavior as v3.x in v4.x
|
||||
|
||||
In order to obtain the same behavior as version `v3.x`, you should install `sentencepiece` additionally:
|
||||
|
||||
In version `v3.x`:
|
||||
```bash
|
||||
pip install transformers
|
||||
```
|
||||
to obtain the same in version `v4.x`:
|
||||
```bash
|
||||
pip install transformers[sentencepiece]
|
||||
```
|
||||
or
|
||||
```bash
|
||||
pip install transformers sentencepiece
|
||||
```
|
||||
#### 3. The architecture of the repo has been updated so that each model resides in its folder
|
||||
|
||||
The past and foreseeable addition of new models means that the number of files in the directory `src/transformers` keeps growing and becomes harder to navigate and understand. We made the choice to put each model and the files accompanying it in their own sub-directories.
|
||||
|
||||
This is a breaking change as importing intermediary layers using a model's module directly needs to be done via a different path.
|
||||
|
||||
##### How to obtain the same behavior as v3.x in v4.x
|
||||
|
||||
In order to obtain the same behavior as version `v3.x`, you should update the path used to access the layers.
|
||||
|
||||
In version `v3.x`:
|
||||
```py
|
||||
from transformers.modeling_bert import BertLayer
|
||||
```
|
||||
to obtain the same in version `v4.x`:
|
||||
```py
|
||||
from transformers.models.bert.modeling_bert import BertLayer
|
||||
```
|
||||
|
||||
#### 4. Switching the `return_dict` argument to `True` by default
|
||||
|
||||
The [`return_dict` argument](https://huggingface.co/transformers/main_classes/output.html) enables the return of dict-like python objects containing the model outputs, instead of the standard tuples. This object is self-documented as keys can be used to retrieve values, while also behaving as a tuple as users may retrieve objects by index or by slice.
|
||||
|
||||
This is a breaking change as the limitation of that tuple is that it cannot be unpacked: `value0, value1 = outputs` will not work.
|
||||
|
||||
##### How to obtain the same behavior as v3.x in v4.x
|
||||
|
||||
In order to obtain the same behavior as version `v3.x`, you should specify the `return_dict` argument to `False`, either in the model configuration or during the forward pass.
|
||||
|
||||
In version `v3.x`:
|
||||
```py
|
||||
model = BertModel.from_pretrained("bert-base-cased")
|
||||
outputs = model(**inputs)
|
||||
```
|
||||
to obtain the same in version `v4.x`:
|
||||
```py
|
||||
model = BertModel.from_pretrained("bert-base-cased")
|
||||
outputs = model(**inputs, return_dict=False)
|
||||
```
|
||||
or
|
||||
```py
|
||||
model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
|
||||
outputs = model(**inputs)
|
||||
```
|
||||
|
||||
#### 5. Removed some deprecated attributes
|
||||
|
||||
Attributes that were deprecated have been removed if they had been deprecated for at least a month. The full list of deprecated attributes can be found in [#8604](https://github.com/huggingface/transformers/pull/8604).
|
||||
|
||||
Here is a list of these attributes/methods/arguments and what their replacements should be (a short illustrative sketch follows the list):
|
||||
|
||||
In several models, the labels become consistent with the other models:
|
||||
- `masked_lm_labels` becomes `labels` in `AlbertForMaskedLM` and `AlbertForPreTraining`.
|
||||
- `masked_lm_labels` becomes `labels` in `BertForMaskedLM` and `BertForPreTraining`.
|
||||
- `masked_lm_labels` becomes `labels` in `DistilBertForMaskedLM`.
|
||||
- `masked_lm_labels` becomes `labels` in `ElectraForMaskedLM`.
|
||||
- `masked_lm_labels` becomes `labels` in `LongformerForMaskedLM`.
|
||||
- `masked_lm_labels` becomes `labels` in `MobileBertForMaskedLM`.
|
||||
- `masked_lm_labels` becomes `labels` in `RobertaForMaskedLM`.
|
||||
- `lm_labels` becomes `labels` in `BartForConditionalGeneration`.
|
||||
- `lm_labels` becomes `labels` in `GPT2DoubleHeadsModel`.
|
||||
- `lm_labels` becomes `labels` in `OpenAIGPTDoubleHeadsModel`.
|
||||
- `lm_labels` becomes `labels` in `T5ForConditionalGeneration`.
|
||||
|
||||
In several models, the caching mechanism becomes consistent with the other models:
|
||||
- `decoder_cached_states` becomes `past_key_values` in all BART-like, FSMT and T5 models.
|
||||
- `decoder_past_key_values` becomes `past_key_values` in all BART-like, FSMT and T5 models.
|
||||
- `past` becomes `past_key_values` in all CTRL models.
|
||||
- `past` becomes `past_key_values` in all GPT-2 models.
|
||||
|
||||
Regarding the tokenizer classes:
|
||||
- The tokenizer attribute `max_len` becomes `model_max_length`.
|
||||
- The tokenizer attribute `return_lengths` becomes `return_length`.
|
||||
- The tokenizer encoding argument `is_pretokenized` becomes `is_split_into_words`.
|
||||
|
||||
Regarding the `Trainer` class:
|
||||
- The `Trainer` argument `tb_writer` is removed in favor of the callback `TensorBoardCallback(tb_writer=...)`.
|
||||
- The `Trainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
|
||||
- The `Trainer` attribute `data_collator` should be a callable.
|
||||
- The `Trainer` method `_log` is deprecated in favor of `log`.
|
||||
- The `Trainer` method `_training_step` is deprecated in favor of `training_step`.
|
||||
- The `Trainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
|
||||
- The `Trainer` method `is_local_master` is deprecated in favor of `is_local_process_zero`.
|
||||
- The `Trainer` method `is_world_master` is deprecated in favor of `is_world_process_zero`.
|
||||
|
||||
Regarding the `TFTrainer` class:
|
||||
- The `TFTrainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
|
||||
- The `Trainer` method `_log` is deprecated in favor of `log`.
|
||||
- The `TFTrainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
|
||||
- The `TFTrainer` method `_setup_wandb` is deprecated in favor of `setup_wandb`.
|
||||
- The `TFTrainer` method `_run_model` is deprecated in favor of `run_model`.
|
||||
|
||||
Regarding the `TrainerArgument` class:
|
||||
- The `TrainerArgument` argument `evaluate_during_training` is deprecated in favor of `evaluation_strategy`.
|
||||
|
||||
Regarding the Transfo-XL model:
|
||||
- The Transfo-XL configuration attribute `tie_weight` becomes `tie_words_embeddings`.
|
||||
- The Transfo-XL modeling method `reset_length` becomes `reset_memory_length`.
|
||||
|
||||
Regarding pipelines:
|
||||
- The `FillMaskPipeline` argument `topk` becomes `top_k`.
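As a short illustrative sketch (not part of the official migration guide; `bert-base-cased` is used purely as an example checkpoint), two of the renames above look like this in practice:

```py
from transformers import BertForMaskedLM, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
model = BertForMaskedLM.from_pretrained("bert-base-cased")

inputs = tokenizer("Paris is the [MASK] of France.", return_tensors="pt")

# v3.x passed the masked-LM targets as `masked_lm_labels`;
# in v4.x the argument is simply `labels`.
outputs = model(**inputs, labels=inputs["input_ids"])

# v3.x exposed the maximum length as `tokenizer.max_len`;
# in v4.x the attribute is `model_max_length`.
print(tokenizer.model_max_length)
```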
|
||||
|
||||
|
||||
|
||||
## Migrating from pytorch-transformers to 🤗 Transformers
|
||||
|
||||
Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to 🤗 Transformers.
|
||||
|
@ -10,7 +10,7 @@ Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi Narayan, Ali
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Unsupervised pre-training of large neural models has recently revolutionized Natural Language Processing. By
|
||||
*Unsupervised pretraining of large neural models has recently revolutionized Natural Language Processing. By
|
||||
warm-starting from the publicly released checkpoints, NLP practitioners have pushed the state-of-the-art on multiple
|
||||
benchmarks while saving significant amounts of compute time. So far the focus has been mainly on the Natural Language
|
||||
Understanding tasks. In this paper, we demonstrate the efficacy of pre-trained checkpoints for Sequence Generation. We
|
||||
|
@ -20,8 +20,8 @@ disentangled attention mechanism, where each word is represented using two vecto
|
||||
position, respectively, and the attention weights among words are computed using disentangled matrices on their
|
||||
contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
|
||||
predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
|
||||
of model pre-training and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half
|
||||
of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
|
||||
of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
|
||||
the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
|
||||
(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
|
||||
pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*
|
||||
|
||||
|
@ -18,9 +18,9 @@ operating these large models in on-the-edge and/or under constrained computation
|
||||
remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
|
||||
model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
|
||||
counterparts. While most prior work investigated the use of distillation for building task-specific models, we leverage
|
||||
knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a BERT model by
|
||||
knowledge distillation during the pretraining phase and show that it is possible to reduce the size of a BERT model by
|
||||
40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage the inductive
|
||||
biases learned by larger models during pre-training, we introduce a triple loss combining language modeling,
|
||||
biases learned by larger models during pretraining, we introduce a triple loss combining language modeling,
|
||||
distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train and we
|
||||
demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative on-device
|
||||
study.*
|
||||
|
@ -5,7 +5,7 @@ Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain Q&A research. It was
|
||||
intorduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__ by
|
||||
introduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__ by
|
||||
Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
@ -12,14 +12,14 @@ identify which tokens were replaced by the generator in the sequence.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Masked language modeling (MLM) pre-training methods such as BERT corrupt the input by replacing some tokens with
|
||||
[MASK] and then train a model to reconstruct the original tokens. While they produce good results when transferred to
|
||||
*Masked language modeling (MLM) pretraining methods such as BERT corrupt the input by replacing some tokens with [MASK]
|
||||
and then train a model to reconstruct the original tokens. While they produce good results when transferred to
|
||||
downstream NLP tasks, they generally require large amounts of compute to be effective. As an alternative, we propose a
|
||||
more sample-efficient pre-training task called replaced token detection. Instead of masking the input, our approach
|
||||
more sample-efficient pretraining task called replaced token detection. Instead of masking the input, our approach
|
||||
corrupts it by replacing some tokens with plausible alternatives sampled from a small generator network. Then, instead
|
||||
of training a model that predicts the original identities of the corrupted tokens, we train a discriminative model that
|
||||
predicts whether each token in the corrupted input was replaced by a generator sample or not. Thorough experiments
|
||||
demonstrate this new pre-training task is more efficient than MLM because the task is defined over all input tokens
|
||||
demonstrate this new pretraining task is more efficient than MLM because the task is defined over all input tokens
|
||||
rather than just the small subset that was masked out. As a result, the contextual representations learned by our
|
||||
approach substantially outperform the ones learned by BERT given the same model size, data, and compute. The gains are
|
||||
particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained
|
||||
|
@ -19,7 +19,7 @@ representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018;
|
||||
heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre for
|
||||
Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
|
||||
classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most of the
|
||||
time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified evaluation
|
||||
time they outperform other pretraining approaches. Different versions of FlauBERT as well as a unified evaluation
|
||||
protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared to the research
|
||||
community for further reproducible experiments in French NLP.*
|
||||
|
||||
|
@ -14,7 +14,7 @@ The abstract from the paper is the following:
|
||||
*Natural language understanding comprises a wide range of diverse tasks such as textual entailment, question answering,
|
||||
semantic similarity assessment, and document classification. Although large unlabeled text corpora are abundant,
|
||||
labeled data for learning these specific tasks is scarce, making it challenging for discriminatively trained models to
|
||||
perform adequately. We demonstrate that large gains on these tasks can be realized by generative pre-training of a
|
||||
perform adequately. We demonstrate that large gains on these tasks can be realized by generative pretraining of a
|
||||
language model on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each specific task. In
|
||||
contrast to previous approaches, we make use of task-aware input transformations during fine-tuning to achieve
|
||||
effective transfer while requiring minimal changes to the model architecture. We demonstrate the effectiveness of our
|
||||
|
@ -6,19 +6,19 @@ Overview
|
||||
|
||||
The LayoutLM model was proposed in the paper `LayoutLM: Pre-training of Text and Layout for Document Image
|
||||
Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and
|
||||
Ming Zhou. It's a simple but effective pre-training method of text and layout for document image understanding and
|
||||
Ming Zhou. It's a simple but effective pretraining method of text and layout for document image understanding and
|
||||
information extraction tasks, such as form understanding and receipt understanding.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Pre-training techniques have been verified successfully in a variety of NLP tasks in recent years. Despite the
|
||||
widespread use of pre-training models for NLP applications, they almost exclusively focus on text-level manipulation,
|
||||
widespread use of pretraining models for NLP applications, they almost exclusively focus on text-level manipulation,
|
||||
while neglecting layout and style information that is vital for document image understanding. In this paper, we propose
|
||||
the \textbf{LayoutLM} to jointly model interactions between text and layout information across scanned document images,
|
||||
which is beneficial for a great number of real-world document image understanding tasks such as information extraction
|
||||
from scanned documents. Furthermore, we also leverage image features to incorporate words' visual information into
|
||||
LayoutLM. To the best of our knowledge, this is the first time that text and layout are jointly learned in a single
|
||||
framework for document-level pre-training. It achieves new state-of-the-art results in several downstream tasks,
|
||||
framework for document-level pretraining. It achieves new state-of-the-art results in several downstream tasks,
|
||||
including form understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image
|
||||
classification (from 93.07 to 94.42).*
|
||||
|
||||
|
@ -19,7 +19,7 @@ Encoder Representations from Transformers) framework to learn these vision-and-l
|
||||
build a large-scale Transformer model that consists of three encoders: an object relationship encoder, a language
|
||||
encoder, and a cross-modality encoder. Next, to endow our model with the capability of connecting vision and language
|
||||
semantics, we pre-train the model with large amounts of image-and-sentence pairs, via five diverse representative
|
||||
pre-training tasks: masked language modeling, masked object prediction (feature regression and label classification),
|
||||
pretraining tasks: masked language modeling, masked object prediction (feature regression and label classification),
|
||||
cross-modality matching, and image question answering. These tasks help in learning both intra-modality and
|
||||
cross-modality relationships. After fine-tuning from our pretrained parameters, our model achieves the state-of-the-art
|
||||
results on two visual question answering datasets (i.e., VQA and GQA). We also show the generalizability of our
|
||||
|
@ -13,7 +13,7 @@ The MBart model was presented in `Multilingual Denoising Pre-training for Neural
|
||||
Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
|
||||
According to the abstract, MBART is a sequence-to-sequence denoising auto-encoder pretrained on large-scale monolingual
|
||||
corpora in many languages using the BART objective. mBART is one of the first methods for pre-training a complete
|
||||
corpora in many languages using the BART objective. mBART is one of the first methods for pretraining a complete
|
||||
sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only
|
||||
on the encoder, decoder, or reconstructing parts of the text.
|
||||
|
||||
|
@ -17,7 +17,7 @@ the next token.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*In this paper, we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel
|
||||
*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
|
||||
self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
|
||||
the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
|
||||
n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
|
||||
@ -25,7 +25,7 @@ step. The future n-gram prediction explicitly encourages the model to plan for t
|
||||
overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
|
||||
dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
|
||||
abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
|
||||
state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus.*
|
||||
state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
|
||||
|
||||
The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
|
||||
|
||||
|
@ -17,7 +17,7 @@ The abstract from the paper is the following:
|
||||
task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning
|
||||
has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of
|
||||
transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a
|
||||
text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer
|
||||
text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer
|
||||
approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration
|
||||
with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering
|
||||
summarization, question answering, text classification, and more. To facilitate future work on transfer learning for
|
||||
|
@ -19,7 +19,7 @@ just the next token. Its architecture is identical to ProphetNet, but the model
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*In this paper, we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel
|
||||
*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
|
||||
self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
|
||||
the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
|
||||
n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
|
||||
@ -27,7 +27,7 @@ step. The future n-gram prediction explicitly encourages the model to plan for t
|
||||
overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
|
||||
dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
|
||||
abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
|
||||
state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus.*
|
||||
state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
|
||||
|
||||
The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
|
||||
|
||||
|
@ -527,10 +527,10 @@ Pegasus
|
||||
<https://arxiv.org/pdf/1912.08777.pdf>`_, Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.
|
||||
|
||||
Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on
|
||||
two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pre-training
|
||||
two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining
|
||||
objective, called Gap Sentence Generation (GSG).
|
||||
|
||||
* MLM: encoder input tokens are randomely replaced by a mask tokens and have to be predicted by the encoder (like in
|
||||
* MLM: encoder input tokens are randomly replaced by a mask token and have to be predicted by the encoder (like in
|
||||
BERT)
|
||||
* GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, which has a
|
||||
causal mask to hide the future words like a regular auto-regressive transformer decoder.
|
||||
@ -609,7 +609,7 @@ MT5
|
||||
`mT5: A massively multilingual pre-trained text-to-text transformer <https://arxiv.org/abs/2010.11934>`_, Linting Xue
|
||||
et al.
|
||||
|
||||
The model architecture is same as T5. mT5's pre-training objective includes T5's self-supervised training, but not T5's
|
||||
The model architecture is the same as T5. mT5's pretraining objective includes T5's self-supervised training, but not T5's
|
||||
supervised training. mT5 is trained on 101 languages.
|
||||
|
||||
The library provides a version of this model for conditional generation.
|
||||
@ -630,8 +630,8 @@ MBart
|
||||
`Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu,
|
||||
Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
|
||||
|
||||
The model architecture and pre-training objective is same as BART, but MBart is trained on 25 languages and is intended
|
||||
for supervised and unsupervised machine translation. MBart is one of the first methods for pre-training a complete
|
||||
The model architecture and pretraining objective are the same as BART's, but MBart is trained on 25 languages and is intended
|
||||
for supervised and unsupervised machine translation. MBart is one of the first methods for pretraining a complete
|
||||
sequence-to-sequence model by denoising full texts in multiple languages,
|
||||
|
||||
The library provides a version of this model for conditional generation.
|
||||
@ -658,7 +658,7 @@ ProphetNet
|
||||
`ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
|
||||
Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.
|
||||
|
||||
ProphetNet introduces a novel *sequence-to-sequence* pre-training objective, called *future n-gram prediction*. In
|
||||
ProphetNet introduces a novel *sequence-to-sequence* pretraining objective, called *future n-gram prediction*. In
|
||||
future n-gram prediction, the model predicts the next n tokens simultaneously based on previous context tokens at each
|
||||
time step instead of just the single next token. The future n-gram prediction explicitly encourages the model
|
||||
to plan for the future tokens and prevent overfitting on strong local correlations. The model architecture is based on
|
||||
@ -683,8 +683,8 @@ XLM-ProphetNet
|
||||
`ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
|
||||
Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.
|
||||
|
||||
XLM-ProphetNet's model architecture and pre-training objective is same as ProphetNet, but XLM-ProphetNet was
|
||||
pre-trained on the cross-lingual dataset `XGLUE <https://arxiv.org/abs/2004.01401>`__.
|
||||
XLM-ProphetNet's model architecture and pretraining objective are the same as ProphetNet's, but XLM-ProphetNet was pre-trained
|
||||
on the cross-lingual dataset `XGLUE <https://arxiv.org/abs/2004.01401>`__.
|
||||
|
||||
The library provides a pre-trained version of this model for multi-lingual conditional generation and fine-tuned
|
||||
versions for headline generation and question generation, respectively.
|
||||
|
@ -109,7 +109,7 @@ XLM-RoBERTa
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
XLM-RoBERTa was trained on 2.5TB of newly created clean CommonCrawl data in 100 languages. It provides strong gains
|
||||
over previously released multi-lingual models like mBERT or XLM on downstream taks like classification, sequence
|
||||
over previously released multi-lingual models like mBERT or XLM on downstream tasks like classification, sequence
|
||||
labeling and question answering.
|
||||
|
||||
Two XLM-RoBERTa checkpoints can be used for multi-lingual tasks:
|
||||
|
@ -62,7 +62,7 @@ sliding the context window so that the model has more context when making each p
|
||||
This is a closer approximation to the true decomposition of the sequence probability and will typically yield a more
|
||||
favorable score. The downside is that it requires a separate forward pass for each token in the corpus. A good
|
||||
practical compromise is to employ a strided sliding window, moving the context by larger strides rather than sliding by
|
||||
1 token a time. This allows computation to procede much faster while still giving the model a large context to make
|
||||
1 token a time. This allows computation to proceed much faster while still giving the model a large context to make
|
||||
predictions at each step.
|
||||
|
||||
Example: Calculating perplexity with GPT-2 in 🤗 Transformers
|
||||
|
@ -305,7 +305,7 @@ Language modeling is the task of fitting a model to a corpus, which can be domai
transformer-based models are trained using a variant of language modeling, e.g. BERT with masked language modeling,
GPT-2 with causal language modeling.

Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
Language modeling can be useful outside of pretraining as well, for example to shift the model distribution to be
domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset or
on scientific papers e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.

@ -25,7 +25,7 @@ class PlotArguments:
    )
    plot_along_batch: bool = field(
        default=False,
        metadata={"help": "Whether to plot along batch size or sequence lengh. Defaults to sequence length."},
        metadata={"help": "Whether to plot along batch size or sequence length. Defaults to sequence length."},
    )
    is_time: bool = field(
        default=False,

@ -17,7 +17,7 @@ This folder contains the original code used to train Distil* as well as examples

## What is Distil*

Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distilled-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.

We have applied the same method to other Transformer architectures and released the weights:
- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 16.3 compared to 21.1 for **DistilGPT2** (after fine-tuning on the train set).
@ -57,7 +57,7 @@ Here are the results on the *test* sets for 6 of the languages available in XNLI

This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`.

**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0).
**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breaking changes compared to v1.1.0).


## How to use DistilBERT
@ -111,7 +111,7 @@ python scripts/binarized_data.py \
--dump_file data/binarized_text
```

Our implementation of masked language modeling loss follows [XLM](https://github.com/facebookresearch/XLM)'s one and smoothes the probability of masking with a factor that put more emphasis on rare words. Thus we count the occurrences of each tokens in the data:
Our implementation of masked language modeling loss follows [XLM](https://github.com/facebookresearch/XLM)'s one and smooths the probability of masking with a factor that put more emphasis on rare words. Thus we count the occurrences of each tokens in the data:

```bash
python scripts/token_counts.py \
@ -173,7 +173,7 @@ python -m torch.distributed.launch \
--token_counts data/token_counts.bert-base-uncased.pickle
```

**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint and use `--student_pretrained_weights` argument to use this initialization for the distilled training!
**Tips:** Starting distilled training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint and use `--student_pretrained_weights` argument to use this initialization for the distilled training!

Happy distillation!

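For readers who want the gist of the distillation objective described above, here is a minimal, self-contained sketch of a temperature-scaled soft-target loss. It is an illustration only, not the exact combination used by the training scripts (which also add masked-language-modeling and cosine-embedding terms):

```python
# Sketch of a knowledge-distillation loss: the student matches the teacher's
# temperature-softened distribution, plus the usual hard-label cross-entropy.
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, temperature=2.0, alpha=0.5):
    # Soft targets: KL divergence between softened student and teacher distributions.
    soft = F.kl_div(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits / temperature, dim=-1),
        reduction="batchmean",
    ) * (temperature ** 2)
    # Hard targets: standard cross-entropy against the ground-truth labels.
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1.0 - alpha) * hard
```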
@ -188,7 +188,7 @@ class Distiller:

    def prepare_batch_mlm(self, batch):
        """
        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.
        Prepare the batch: from the token_ids and the lengths, compute the attention mask and the masked label for MLM.

        Input:
        ------
@ -200,7 +200,7 @@ class Distiller:
        -------
        token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
        attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
        mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict.
        mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels. There is a -100 where there is nothing to predict.
        """
        token_ids, lengths = batch
        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@ -253,7 +253,7 @@ class Distiller:

    def prepare_batch_clm(self, batch):
        """
        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM.
        Prepare the batch: from the token_ids and the lengths, compute the attention mask and the labels for CLM.

        Input:
        ------
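
To make the shape bookkeeping in these docstrings concrete, here is a simplified, hypothetical sketch of the MLM variant. The real `prepare_batch_mlm` also smooths masking probabilities by token frequency and mixes [MASK]/random/keep replacements, which this sketch deliberately skips:

```python
# Simplified sketch: build the attention mask from the sequence lengths and mark
# non-masked positions with -100 so the loss ignores them.
import torch

def prepare_batch_mlm_sketch(token_ids, lengths, mask_token_id, mlm_mask_prop=0.15):
    bs, seq_length = token_ids.size()
    # attention mask: True for real tokens, False for padding
    attn_mask = torch.arange(seq_length)[None, :] < lengths[:, None]

    # sample positions to predict (only among real tokens)
    pred_mask = (torch.rand(bs, seq_length) < mlm_mask_prop) & attn_mask
    mlm_labels = token_ids.clone()
    mlm_labels[~pred_mask] = -100          # -100 where there is nothing to predict

    masked_ids = token_ids.clone()
    masked_ids[pred_mask] = mask_token_id  # simplest choice: always use [MASK]
    return masked_ids, attn_mask, mlm_labels
```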
@ -86,7 +86,7 @@ if __name__ == "__main__":
        compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"]

    print(f"N layers selected for distillation: {std_idx}")
    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
    print(f"Number of params transferred for distillation: {len(compressed_sd.keys())}")

    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
    print(f"Save transferred checkpoint to {args.dump_checkpoint}.")
    torch.save(compressed_sd, args.dump_checkpoint)

@ -21,7 +21,7 @@ You can also have a look at this fun *Explain Like I'm Five* introductory [slide

One promise of extreme pruning is to obtain extremely small models that can be easily sent (and stored) on edge devices. By setting weights to 0., we reduce the amount of information we need to store, and thus decreasing the memory size. We are able to obtain extremely sparse fine-pruned models with movement pruning: ~95% of the dense performance with ~5% of total remaining weights in the BERT encoder.

In [this notebook](https://github.com/huggingface/transformers/blob/master/examples/movement-pruning/Saving_PruneBERT.ipynb), we showcase how we can leverage standard tools that exist out-of-the-box to efficiently store an extremely sparse question answering model (only 6% of total remaining weights in the encoder). We are able to reduce the memory size of the encoder **from the 340MB (the orignal dense BERT) to 11MB**, without any additional training of the model (every operation is performed *post fine-pruning*). It is sufficiently small to store it on a [91' floppy disk](https://en.wikipedia.org/wiki/Floptical) 📎!
In [this notebook](https://github.com/huggingface/transformers/blob/master/examples/movement-pruning/Saving_PruneBERT.ipynb), we showcase how we can leverage standard tools that exist out-of-the-box to efficiently store an extremely sparse question answering model (only 6% of total remaining weights in the encoder). We are able to reduce the memory size of the encoder **from the 340MB (the original dense BERT) to 11MB**, without any additional training of the model (every operation is performed *post fine-pruning*). It is sufficiently small to store it on a [91' floppy disk](https://en.wikipedia.org/wiki/Floptical) 📎!

While movement pruning does not directly optimize for memory footprint (but rather the number of non-null weights), we hypothetize that further memory compression ratios can be achieved with specific quantization aware trainings (see for instance [Q8BERT](https://arxiv.org/abs/1910.06188), [And the Bit Goes Down](https://arxiv.org/abs/1907.05686) or [Quant-Noise](https://arxiv.org/abs/2004.07320)).

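As a rough illustration of the "standard tools" idea above (not the notebook's exact recipe; the file name below is a placeholder), combining int8 quantization with a SciPy CSR layout already shrinks a mostly-zero weight matrix dramatically:

```python
# Sketch: store a fine-pruned (mostly-zero) weight matrix compactly with int8
# quantization plus a CSR sparse layout. Illustrative only.
import numpy as np
from scipy import sparse

weight = np.load("pruned_layer.npy")                 # hypothetical pruned dense weights
scale = max(np.abs(weight).max() / 127, 1e-8)        # simple symmetric quantization scale
quantized = np.round(weight / scale).astype(np.int8)

csr = sparse.csr_matrix(quantized)                   # only non-zero entries are stored
sparse.save_npz("pruned_layer_int8.npz", csr)

# At load time: dequantize back to float32 before running the model.
restored = sparse.load_npz("pruned_layer_int8.npz").toarray().astype(np.float32) * scale
```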
@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Binarizers take a (real value) matrice as input and produce a binary (values in {0,1}) mask of the same shape.
Binarizers take a (real value) matrix as input and produce a binary (values in {0,1}) mask of the same shape.
"""

import torch
@ -3,7 +3,8 @@
|
||||
python finetune_trainer.py \
|
||||
--learning_rate=3e-5 \
|
||||
--fp16 \
|
||||
--do_train --do_eval --do_predict --evaluate_during_training \
|
||||
--do_train --do_eval --do_predict \
|
||||
--evaluation_strategy steps \
|
||||
--predict_with_generate \
|
||||
--n_val 1000 \
|
||||
"$@"
|
||||
|
@ -5,7 +5,8 @@ export TPU_NUM_CORES=8
|
||||
python xla_spawn.py --num_cores $TPU_NUM_CORES \
|
||||
finetune_trainer.py \
|
||||
--learning_rate=3e-5 \
|
||||
--do_train --do_eval --evaluate_during_training \
|
||||
--do_train --do_eval \
|
||||
--evaluation_strategy steps \
|
||||
--prediction_loss_only \
|
||||
--n_val 1000 \
|
||||
"$@"
|
||||
|
@ -16,7 +16,8 @@ python finetune_trainer.py \
|
||||
--num_train_epochs=6 \
|
||||
--save_steps 3000 --eval_steps 3000 \
|
||||
--max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
|
||||
--do_train --do_eval --do_predict --evaluate_during_training\
|
||||
--do_train --do_eval --do_predict \
|
||||
--evaluation_strategy steps \
|
||||
--predict_with_generate --logging_first_step \
|
||||
--task translation --label_smoothing 0.1 \
|
||||
"$@"
|
||||
|
@ -17,7 +17,8 @@ python xla_spawn.py --num_cores $TPU_NUM_CORES \
|
||||
--save_steps 500 --eval_steps 500 \
|
||||
--logging_first_step --logging_steps 200 \
|
||||
--max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
|
||||
--do_train --do_eval --evaluate_during_training \
|
||||
--do_train --do_eval \
|
||||
--evaluation_strategy steps \
|
||||
--prediction_loss_only \
|
||||
--task translation --label_smoothing 0.1 \
|
||||
"$@"
|
||||
|
@ -19,6 +19,7 @@ python finetune_trainer.py \
|
||||
--save_steps 3000 --eval_steps 3000 \
|
||||
--logging_first_step \
|
||||
--max_target_length 56 --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \
|
||||
--do_train --do_eval --do_predict --evaluate_during_training \
|
||||
--do_train --do_eval --do_predict \
|
||||
--evaluation_strategy steps \
|
||||
--predict_with_generate --sortish_sampler \
|
||||
"$@"
|
||||
|
@ -15,7 +15,8 @@ python finetune_trainer.py \
|
||||
--sortish_sampler \
|
||||
--num_train_epochs 6 \
|
||||
--save_steps 25000 --eval_steps 25000 --logging_steps 1000 \
|
||||
--do_train --do_eval --do_predict --evaluate_during_training \
|
||||
--predict_with_generate --logging_first_step
|
||||
--do_train --do_eval --do_predict \
|
||||
--evaluation_strategy steps \
|
||||
--predict_with_generate --logging_first_step \
|
||||
--task translation \
|
||||
"$@"
|
||||
|
@ -4,7 +4,7 @@ language: sv
|
||||
|
||||
# Swedish BERT Models
|
||||
|
||||
The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on aproximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, swedish wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.
|
||||
The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on approximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, swedish wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.
|
||||
|
||||
The following three models are currently available:
|
||||
|
||||
@ -86,7 +86,7 @@ for token in nlp(text):
|
||||
print(l)
|
||||
```
|
||||
|
||||
Which should result in the following (though less cleanly formated):
|
||||
Which should result in the following (though less cleanly formatted):
|
||||
|
||||
```python
|
||||
[ { 'word': 'Engelbert', 'score': 0.99..., 'entity': 'PRS'},
|
||||
@ -104,7 +104,7 @@ Which should result in the following (though less cleanly formated):
|
||||
|
||||
### ALBERT base
|
||||
|
||||
The easisest way to do this is, again, using Huggingface Transformers:
|
||||
The easiest way to do this is, again, using Huggingface Transformers:
|
||||
|
||||
```python
|
||||
from transformers import AutoModel,AutoTokenizer
|
||||
|
@ -4,7 +4,7 @@ language: sv
|
||||
|
||||
# Swedish BERT Models
|
||||
|
||||
The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on aproximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, swedish wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.
|
||||
The National Library of Sweden / KBLab releases three pretrained language models based on BERT and ALBERT. The models are trained on approximately 15-20GB of text (200M sentences, 3000M tokens) from various sources (books, news, government publications, swedish wikipedia and internet forums) aiming to provide a representative BERT model for Swedish text. A more complete description will be published later on.
|
||||
|
||||
The following three models are currently available:
|
||||
|
||||
@ -86,7 +86,7 @@ for token in nlp(text):
|
||||
print(l)
|
||||
```
|
||||
|
||||
Which should result in the following (though less cleanly formated):
|
||||
Which should result in the following (though less cleanly formatted):
|
||||
|
||||
```python
|
||||
[ { 'word': 'Engelbert', 'score': 0.99..., 'entity': 'PRS'},
|
||||
@ -104,7 +104,7 @@ Which should result in the following (though less cleanly formated):
|
||||
|
||||
### ALBERT base
|
||||
|
||||
The easisest way to do this is, again, using Huggingface Transformers:
|
||||
The easiest way to do this is, again, using Huggingface Transformers:
|
||||
|
||||
```python
|
||||
from transformers import AutoModel,AutoTokenizer
|
||||
|
@ -4,7 +4,7 @@ tags:
|
||||
---
|
||||
|
||||
## CS224n SQuAD2.0 Project Dataset
|
||||
The goal of this model is to save CS224n students GPU time when establising
|
||||
The goal of this model is to save CS224n students GPU time when establishing
|
||||
baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
|
||||
The training set used to fine-tune this model is the same as
|
||||
the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,
|
||||
|
@ -4,7 +4,7 @@ tags:
|
||||
---
|
||||
|
||||
## CS224n SQuAD2.0 Project Dataset
|
||||
The goal of this model is to save CS224n students GPU time when establising
|
||||
The goal of this model is to save CS224n students GPU time when establishing
|
||||
baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
|
||||
The training set used to fine-tune this model is the same as
|
||||
the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,
|
||||
|
@ -1,5 +1,5 @@
|
||||
## CS224n SQuAD2.0 Project Dataset
|
||||
The goal of this model is to save CS224n students GPU time when establising
|
||||
The goal of this model is to save CS224n students GPU time when establishing
|
||||
baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
|
||||
The training set used to fine-tune this model is the same as
|
||||
the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,
|
||||
|
@ -1,5 +1,5 @@
|
||||
## CS224n SQuAD2.0 Project Dataset
|
||||
The goal of this model is to save CS224n students GPU time when establising
|
||||
The goal of this model is to save CS224n students GPU time when establishing
|
||||
baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
|
||||
The training set used to fine-tune this model is the same as
|
||||
the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,
|
||||
|
@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
|
||||
|
||||
## Details of the downstream task (Intent Prediction) - Dataset 📚
|
||||
|
||||
Dataset ID: ```event2Mind``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
|
||||
Dataset ID: ```event2Mind``` from [Huggingface/NLP](https://github.com/huggingface/nlp)
|
||||
|
||||
| Dataset | Split | # samples |
|
||||
| -------- | ----- | --------- |
|
||||
|
@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
|
||||
|
||||
## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓
|
||||
|
||||
Dataset ID: ```squad``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
|
||||
Dataset ID: ```squad``` from [Huggingface/NLP](https://github.com/huggingface/nlp)
|
||||
|
||||
| Dataset | Split | # samples |
|
||||
| -------- | ----- | --------- |
|
||||
|
@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
|
||||
|
||||
## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓
|
||||
|
||||
Dataset ID: ```squad_v2``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
|
||||
Dataset ID: ```squad_v2``` from [Huggingface/NLP](https://github.com/huggingface/nlp)
|
||||
|
||||
| Dataset | Split | # samples |
|
||||
| -------- | ----- | --------- |
|
||||
|
@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
|
||||
|
||||
## Details of the Dataset 📚
|
||||
|
||||
Dataset ID: ```wikisql``` from [HugginFace/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
|
||||
Dataset ID: ```wikisql``` from [Huggingface/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
|
||||
|
||||
| Dataset | Split | # samples |
|
||||
| -------- | ----- | --------- |
|
||||
|
@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
|
||||
|
||||
## Details of the Dataset 📚
|
||||
|
||||
Dataset ID: ```wikisql``` from [HugginFace/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
|
||||
Dataset ID: ```wikisql``` from [Huggingface/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
|
||||
|
||||
| Dataset | Split | # samples |
|
||||
| -------- | ----- | --------- |
|
||||
|
@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
|
||||
|
||||
## Details of the downstream task (Question Paraphrasing) - Dataset 📚❓↔️❓
|
||||
|
||||
Dataset ID: ```quora``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
|
||||
Dataset ID: ```quora``` from [Huggingface/NLP](https://github.com/huggingface/nlp)
|
||||
|
||||
| Dataset | Split | # samples |
|
||||
| -------- | ----- | --------- |
|
||||
|
@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
|
||||
|
||||
## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓
|
||||
|
||||
Dataset ID: ```squad``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
|
||||
Dataset ID: ```squad``` from [Huggingface/NLP](https://github.com/huggingface/nlp)
|
||||
|
||||
| Dataset | Split | # samples |
|
||||
| -------- | ----- | --------- |
|
||||
|
@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
|
||||
|
||||
## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓
|
||||
|
||||
Dataset ID: ```squad_v2``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
|
||||
Dataset ID: ```squad_v2``` from [Huggingface/NLP](https://github.com/huggingface/nlp)
|
||||
|
||||
| Dataset | Split | # samples |
|
||||
| -------- | ----- | --------- |
|
||||
|
@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
|
||||
|
||||
## Details of the Dataset 📚
|
||||
|
||||
Dataset ID: ```wikisql``` from [HugginFace/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
|
||||
Dataset ID: ```wikisql``` from [Huggingface/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
|
||||
|
||||
| Dataset | Split | # samples |
|
||||
| -------- | ----- | --------- |
|
||||
|
setup.py
@ -119,7 +119,7 @@ extras["dev"] = extras["all"] + extras["testing"] + extras["quality"] + extras["

setup(
    name="transformers",
    version="4.0.0-rc-1",
    version="4.0.1",
    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
    author_email="thomas@huggingface.co",
    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",

@ -2,7 +2,7 @@
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

__version__ = "4.0.0-rc-1"
__version__ = "4.0.1"

# Work around to update TensorFlow's absl.logging threshold which alters the
# default Python logging output behavior when present.
@ -98,6 +98,7 @@ from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig
|
||||
from .models.auto import (
|
||||
ALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
CONFIG_MAPPING,
|
||||
MODEL_NAMES_MAPPING,
|
||||
TOKENIZER_MAPPING,
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
@ -876,6 +877,7 @@ else:
|
||||
|
||||
|
||||
if is_flax_available():
|
||||
from .models.auto import FLAX_MODEL_MAPPING, FlaxAutoModel
|
||||
from .models.bert import FlaxBertModel
|
||||
from .models.roberta import FlaxRobertaModel
|
||||
else:
|
||||
|
@ -55,8 +55,6 @@ class PretrainedConfig(object):
|
||||
Whether or not the model should return all hidden-states.
|
||||
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not the model should returns all attentions.
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||
return_dict (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a plain
|
||||
tuple.
|
||||
@ -168,7 +166,6 @@ class PretrainedConfig(object):
|
||||
self.return_dict = kwargs.pop("return_dict", True)
|
||||
self.output_hidden_states = kwargs.pop("output_hidden_states", False)
|
||||
self.output_attentions = kwargs.pop("output_attentions", False)
|
||||
self.use_cache = kwargs.pop("use_cache", True) # Not used by all models
|
||||
self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models
|
||||
self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
|
||||
self.pruned_heads = kwargs.pop("pruned_heads", {})
|
||||
|
@ -229,7 +229,7 @@ class LineByLineWithSOPTextDataset(Dataset):
|
||||
# to `block_size` anyways, so short sequences are generally wasted
|
||||
# computation. However, we *sometimes*
|
||||
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
|
||||
# sequences to minimize the mismatch between pre-training and fine-tuning.
|
||||
# sequences to minimize the mismatch between pretraining and fine-tuning.
|
||||
# The `target_seq_length` is just a rough target however, whereas
|
||||
# `block_size` is a hard limit.
|
||||
target_seq_length = max_num_tokens
|
||||
@ -425,7 +425,7 @@ class TextDatasetForNextSentencePrediction(Dataset):
|
||||
# to `block_size` anyways, so short sequences are generally wasted
|
||||
# computation. However, we *sometimes*
|
||||
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
|
||||
# sequences to minimize the mismatch between pre-training and fine-tuning.
|
||||
# sequences to minimize the mismatch between pretraining and fine-tuning.
|
||||
# The `target_seq_length` is just a rough target however, whereas
|
||||
# `block_size` is a hard limit.
|
||||
target_seq_length = max_num_tokens
|
||||
|
@ -203,8 +203,28 @@ except ImportError:
    _tokenizers_available = False


default_cache_path = os.path.join(torch_cache_home, "transformers")
old_default_cache_path = os.path.join(torch_cache_home, "transformers")
# New default cache, shared with the Datasets library
hf_cache_home = os.path.expanduser(
    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
)
default_cache_path = os.path.join(hf_cache_home, "transformers")

# Onetime move from the old location to the new one if no ENV variable has been set.
if (
    os.path.isdir(old_default_cache_path)
    and "PYTORCH_PRETRAINED_BERT_CACHE" not in os.environ
    and "PYTORCH_TRANSFORMERS_CACHE" not in os.environ
    and "TRANSFORMERS_CACHE" not in os.environ
):
    logger.warn(
        "In Transformers v4.0.0, the default path to cache downloaded models changed from "
        "'~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden "
        "and '~/.cache/torch/transformers' is a directory that exists, we're moving it to "
        "'~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should "
        "only see this message once."
    )
    shutil.move(old_default_cache_path, default_cache_path)

PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)

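A small usage note on the new cache location: the environment variables read above can still redirect the cache as long as they are set before `transformers` is imported. A minimal sketch (the paths are placeholders):

```python
# Redirect the Hugging Face cache before importing transformers.
import os

os.environ["HF_HOME"] = "/data/hf-home"              # whole HF cache -> /data/hf-home/transformers
# os.environ["TRANSFORMERS_CACHE"] = "/data/models"  # or override only the Transformers model cache

from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")  # downloads into the chosen cache directory
```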
@ -38,6 +38,7 @@ class TFGenerationMixin:
|
||||
|
||||
def _use_cache(self, outputs, use_cache):
|
||||
"""During generation, decide whether to pass the `past` variable to the next forward pass."""
|
||||
use_cache = getattr(self.config, "use_cache", False)
|
||||
if len(outputs) <= 1 or use_cache is False:
|
||||
return False
|
||||
if hasattr(self.config, "mem_len") and self.config.mem_len == 0:
|
||||
@ -194,7 +195,6 @@ class TFGenerationMixin:
|
||||
min_length = min_length if min_length is not None else self.config.min_length
|
||||
do_sample = do_sample if do_sample is not None else self.config.do_sample
|
||||
early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
num_beams = num_beams if num_beams is not None else self.config.num_beams
|
||||
temperature = temperature if temperature is not None else self.config.temperature
|
||||
top_k = top_k if top_k is not None else self.config.top_k
|
||||
@ -224,7 +224,6 @@ class TFGenerationMixin:
|
||||
assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
|
||||
assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
|
||||
assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
|
||||
assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
|
||||
assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
|
||||
assert temperature > 0, "`temperature` should be strictly positive."
|
||||
assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
|
||||
|
@ -462,7 +462,6 @@ class GenerationMixin:
|
||||
pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
|
||||
bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
|
||||
eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
|
||||
if input_ids is None:
|
||||
# init `input_ids` with bos_token_id
|
||||
|
@ -2,6 +2,7 @@
import math
import os

from .trainer_utils import EvaluationStrategy
from .utils import logging


@ -212,13 +213,13 @@ def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestR
        # Check for `do_eval` and `eval_during_training` for schedulers that require intermediate reporting.
        if isinstance(
            kwargs["scheduler"], (ASHAScheduler, MedianStoppingRule, HyperBandForBOHB, PopulationBasedTraining)
        ) and (not trainer.args.do_eval or not trainer.args.evaluate_during_training):
        ) and (not trainer.args.do_eval or trainer.args.evaluation_strategy == EvaluationStrategy.NO):
            raise RuntimeError(
                "You are using {cls} as a scheduler but you haven't enabled evaluation during training. "
                "This means your trials will not report intermediate results to Ray Tune, and "
                "can thus not be stopped early or used to exploit other trials parameters. "
                "If this is what you want, do not use {cls}. If you would like to use {cls}, "
                "make sure you pass `do_eval=True` and `evaluate_during_training=True` in the "
                "make sure you pass `do_eval=True` and `evaluation_strategy='steps'` in the "
                "Trainer `args`.".format(cls=type(kwargs["scheduler"]).__name__)
            )

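A hedged usage sketch of what this check enforces: a Ray Tune scheduler that prunes trials needs the `Trainer` to evaluate during training, which is now expressed through `evaluation_strategy`. The model, datasets and trial budget below are placeholders, not values taken from the diff:

```python
# Sketch: enable intermediate evaluation so a Ray Tune scheduler can stop trials early.
# `model_init`, `train_ds` and `eval_ds` are placeholders for objects defined elsewhere.
from ray.tune.schedulers import ASHAScheduler
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="hp_search",
    do_eval=True,
    evaluation_strategy="steps",   # replaces the removed `evaluate_during_training` flag
    eval_steps=500,
)
trainer = Trainer(model_init=model_init, args=args, train_dataset=train_ds, eval_dataset=eval_ds)

best_run = trainer.hyperparameter_search(
    backend="ray",
    n_trials=8,
    scheduler=ASHAScheduler(metric="objective", mode="max"),
)
```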
@ -39,7 +39,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
|
||||
return tuple with:
|
||||
|
||||
- pytorch model weight name
|
||||
- transpose: boolean indicating wether TF2.0 and PyTorch weights matrices are transposed with regards to each
|
||||
- transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regards to each
|
||||
other
|
||||
"""
|
||||
tf_name = tf_name.replace(":0", "") # device ids
|
||||
@ -164,9 +164,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
|
||||
if allow_missing_keys:
|
||||
missing_keys.append(name)
|
||||
continue
|
||||
elif tf_model.authorized_missing_keys is not None:
|
||||
elif tf_model._keys_to_ignore_on_load_missing is not None:
|
||||
# authorized missing keys don't have to be loaded
|
||||
if any(re.search(pat, name) is not None for pat in tf_model.authorized_missing_keys):
|
||||
if any(re.search(pat, name) is not None for pat in tf_model._keys_to_ignore_on_load_missing):
|
||||
continue
|
||||
|
||||
raise AttributeError("{} not found in PyTorch model".format(name))
|
||||
@ -209,11 +209,11 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
|
||||
|
||||
unexpected_keys = list(all_pytorch_weights)
|
||||
|
||||
if tf_model.authorized_missing_keys is not None:
|
||||
for pat in tf_model.authorized_missing_keys:
|
||||
if tf_model._keys_to_ignore_on_load_missing is not None:
|
||||
for pat in tf_model._keys_to_ignore_on_load_missing:
|
||||
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
|
||||
if tf_model.authorized_unexpected_keys is not None:
|
||||
for pat in tf_model.authorized_unexpected_keys:
|
||||
if tf_model._keys_to_ignore_on_load_unexpected is not None:
|
||||
for pat in tf_model._keys_to_ignore_on_load_unexpected:
|
||||
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
|
||||
|
||||
if len(unexpected_keys) > 0:
|
||||
|
@ -343,15 +343,15 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
|
||||
:class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
|
||||
- **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
|
||||
derived classes of the same architecture adding modules on top of the base model.
|
||||
- **authorized_missing_keys** (:obj:`List[str]`, `optional`) -- A list of re pattern of tensor names to ignore
|
||||
from the model when loading the model weights (and avoid unnecessary warnings).
|
||||
- **authorized_unexpected_keys** (:obj:`List[str]`, `optional`) -- A list of re pattern of tensor names to
|
||||
ignore from the weights when loading the model weights (and avoid unnecessary warnings).
|
||||
"""
|
||||
config_class = None
|
||||
base_model_prefix = ""
|
||||
authorized_missing_keys = None
|
||||
authorized_unexpected_keys = None
|
||||
# a list of re pattern of tensor names to ignore from the model when loading the model weights
|
||||
# (and avoid unnecessary warnings).
|
||||
_keys_to_ignore_on_load_missing = None
|
||||
# a list of re pattern of tensor names to ignore from the weights when loading the model weights
|
||||
# (and avoid unnecessary warnings).
|
||||
_keys_to_ignore_on_load_unexpected = None
|
||||
|
||||
@property
|
||||
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
|
||||
@ -742,12 +742,12 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
|
||||
|
||||
model(model.dummy_inputs, training=False) # Make sure restore ops are run
|
||||
|
||||
if cls.authorized_missing_keys is not None:
|
||||
for pat in cls.authorized_missing_keys:
|
||||
if cls._keys_to_ignore_on_load_missing is not None:
|
||||
for pat in cls._keys_to_ignore_on_load_missing:
|
||||
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
|
||||
|
||||
if cls.authorized_unexpected_keys is not None:
|
||||
for pat in cls.authorized_unexpected_keys:
|
||||
if cls._keys_to_ignore_on_load_unexpected is not None:
|
||||
for pat in cls._keys_to_ignore_on_load_unexpected:
|
||||
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
|
||||
|
||||
if len(unexpected_keys) > 0:
|
||||
|
@ -404,17 +404,18 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
|
||||
|
||||
- **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
|
||||
derived classes of the same architecture adding modules on top of the base model.
|
||||
- **authorized_missing_keys** (:obj:`Optional[List[str]]`) -- A list of re pattern of tensor names to ignore
|
||||
when loading the model (and avoid unnecessary warnings).
|
||||
- **keys_to_never_save** (:obj:`Optional[List[str]]`) -- A list of of tensor names to ignore when saving the
|
||||
model (useful for keys that aren't trained, but which are deterministic)
|
||||
|
||||
"""
|
||||
config_class = None
|
||||
base_model_prefix = ""
|
||||
authorized_missing_keys = None
|
||||
authorized_unexpected_keys = None
|
||||
keys_to_never_save = None
|
||||
# a list of re pattern of tensor names to ignore from the model when loading the model weights
|
||||
# (and avoid unnecessary warnings).
|
||||
_keys_to_ignore_on_load_missing = None
|
||||
# a list of re pattern of tensor names to ignore from the weights when loading the model weights
|
||||
# (and avoid unnecessary warnings).
|
||||
_keys_to_ignore_on_load_unexpected = None
|
||||
# a list of of tensor names to ignore when saving the model (useful for keys that aren't
|
||||
# trained, but which are deterministic)
|
||||
_keys_to_ignore_on_save = None
|
||||
|
||||
@property
|
||||
def dummy_inputs(self) -> Dict[str, torch.Tensor]:
|
||||
@ -719,8 +720,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
|
||||
state_dict = model_to_save.state_dict()
|
||||
|
||||
# Handle the case where some state_dict keys shouldn't be saved
|
||||
if self.keys_to_never_save is not None:
|
||||
state_dict = {k: v for k, v in state_dict.items() if k not in self.keys_to_never_save}
|
||||
if self._keys_to_ignore_on_save is not None:
|
||||
state_dict = {k: v for k, v in state_dict.items() if k not in self._keys_to_ignore_on_save}
|
||||
|
||||
# If we save using the predefined names, we can load using `from_pretrained`
|
||||
output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
|
||||
@ -1034,12 +1035,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
|
||||
|
||||
# Some models may have keys that are not in the state by design, removing them before needlessly warning
|
||||
# the user.
|
||||
if cls.authorized_missing_keys is not None:
|
||||
for pat in cls.authorized_missing_keys:
|
||||
if cls._keys_to_ignore_on_load_missing is not None:
|
||||
for pat in cls._keys_to_ignore_on_load_missing:
|
||||
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
|
||||
|
||||
if cls.authorized_unexpected_keys is not None:
|
||||
for pat in cls.authorized_unexpected_keys:
|
||||
if cls._keys_to_ignore_on_load_unexpected is not None:
|
||||
for pat in cls._keys_to_ignore_on_load_unexpected:
|
||||
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
|
||||
|
||||
if len(unexpected_keys) > 0:
|
||||
|
@ -214,6 +214,7 @@ class AlbertEmbeddings(nn.Module):
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward
|
||||
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
|
||||
@ -265,6 +266,11 @@ class AlbertAttention(nn.Module):
|
||||
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.pruned_heads = set()
|
||||
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
|
||||
self.max_position_embeddings = config.max_position_embeddings
|
||||
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention.transpose_for_scores
|
||||
def transpose_for_scores(self, x):
|
||||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
||||
@ -459,7 +465,7 @@ class AlbertPreTrainedModel(PreTrainedModel):
|
||||
|
||||
config_class = AlbertConfig
|
||||
base_model_prefix = "albert"
|
||||
authorized_missing_keys = [r"position_ids"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights."""
|
||||
@ -705,7 +711,7 @@ class AlbertModel(AlbertPreTrainedModel):
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
Albert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a
|
||||
Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
|
||||
`sentence order prediction (classification)` head.
|
||||
""",
|
||||
ALBERT_START_DOCSTRING,
|
||||
@ -851,7 +857,7 @@ class AlbertSOPHead(nn.Module):
|
||||
)
|
||||
class AlbertForMaskedLM(AlbertPreTrainedModel):
|
||||
|
||||
authorized_unexpected_keys = [r"pooler"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
@ -1021,7 +1027,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
|
||||
)
|
||||
class AlbertForTokenClassification(AlbertPreTrainedModel):
|
||||
|
||||
authorized_unexpected_keys = [r"pooler"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
@ -1110,7 +1116,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
|
||||
)
|
||||
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
|
||||
|
||||
authorized_unexpected_keys = [r"pooler"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
@ -768,7 +768,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
Albert Model with two heads on top for pre-training: a `masked language modeling` head and a `sentence order
|
||||
Albert Model with two heads on top for pretraining: a `masked language modeling` head and a `sentence order
|
||||
prediction` (classification) head.
|
||||
""",
|
||||
ALBERT_START_DOCSTRING,
|
||||
@ -843,7 +843,7 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
|
||||
@add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
|
||||
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
|
||||
|
||||
authorized_missing_keys = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"pooler"]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
@ -1013,7 +1013,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
|
||||
)
|
||||
class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificationLoss):
|
||||
|
||||
authorized_missing_keys = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"pooler"]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
@ -1100,7 +1100,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
|
||||
)
|
||||
class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringLoss):
|
||||
|
||||
authorized_missing_keys = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"pooler"]
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super().__init__(config, *inputs, **kwargs)
|
||||
|
@ -2,8 +2,8 @@
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
from ...file_utils import is_tf_available, is_torch_available
|
||||
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, AutoConfig
|
||||
from ...file_utils import is_flax_available, is_tf_available, is_torch_available
|
||||
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, MODEL_NAMES_MAPPING, AutoConfig
|
||||
from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
|
||||
|
||||
|
||||
@ -57,3 +57,6 @@ if is_tf_available():
|
||||
TFAutoModelForTokenClassification,
|
||||
TFAutoModelWithLMHead,
|
||||
)
|
||||
|
||||
if is_flax_available():
|
||||
from .modeling_flax_auto import FLAX_MODEL_MAPPING, FlaxAutoModel
|
||||
|
@ -36,7 +36,7 @@ ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict(
|
||||
for key, value, in pretrained_map.items()
|
||||
)
|
||||
|
||||
MODEL_MAPPING = OrderedDict(
|
||||
FLAX_MODEL_MAPPING = OrderedDict(
|
||||
[
|
||||
(RobertaConfig, FlaxRobertaModel),
|
||||
(BertConfig, FlaxBertModel),
|
||||
@ -79,13 +79,13 @@ class FlaxAutoModel(object):
|
||||
model = FlaxAutoModel.from_config(config)
|
||||
# E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||
"""
|
||||
for config_class, model_class in MODEL_MAPPING.items():
|
||||
for config_class, model_class in FLAX_MODEL_MAPPING.items():
|
||||
if isinstance(config, config_class):
|
||||
return model_class(config)
|
||||
raise ValueError(
|
||||
f"Unrecognized configuration class {config.__class__} "
|
||||
f"for this kind of FlaxAutoModel: {cls.__name__}.\n"
|
||||
f"Model type should be one of {', '.join(c.__name__ for c in MODEL_MAPPING.keys())}."
|
||||
f"Model type should be one of {', '.join(c.__name__ for c in FLAX_MODEL_MAPPING.keys())}."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -173,11 +173,11 @@ class FlaxAutoModel(object):
|
||||
if not isinstance(config, PretrainedConfig):
|
||||
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
for config_class, model_class in MODEL_MAPPING.items():
|
||||
for config_class, model_class in FLAX_MODEL_MAPPING.items():
|
||||
if isinstance(config, config_class):
|
||||
return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
|
||||
raise ValueError(
|
||||
f"Unrecognized configuration class {config.__class__} "
|
||||
f"for this kind of FlaxAutoModel: {cls.__name__}.\n"
|
||||
f"Model type should be one of {', '.join(c.__name__ for c in MODEL_MAPPING.keys())}"
|
||||
f"Model type should be one of {', '.join(c.__name__ for c in FLAX_MODEL_MAPPING.keys())}"
|
||||
)
|
||||
|
@ -72,6 +72,7 @@ from .configuration_auto import (
    MarianConfig,
    MBartConfig,
    MobileBertConfig,
    MT5Config,
    OpenAIGPTConfig,
    PegasusConfig,
    ProphetNetConfig,
@ -173,6 +174,7 @@ TOKENIZER_MAPPING = OrderedDict(
    [
        (RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)),
        (T5Config, (T5Tokenizer, T5TokenizerFast)),
        (MT5Config, (T5Tokenizer, T5TokenizerFast)),
        (MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)),
        (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)),
        (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)),
@ -340,7 +342,13 @@ class AutoTokenizer:
            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )

        raise ValueError(
            "Unrecognized configuration class {} to build an AutoTokenizer.\n"

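A brief, hedged usage example of the new fallback behaviour (the model name is chosen only for illustration): `AutoTokenizer` now raises the error above only when neither a fast nor a slow tokenizer can be built, so the relevant backend package has to be installed for SentencePiece-based checkpoints:

```python
# Illustration: the fast (Rust) ALBERT tokenizer needs the `tokenizers` package,
# while the slow Python one is SentencePiece-based and needs `sentencepiece`.
# If neither backend is available, AutoTokenizer raises the ValueError above.
from transformers import AutoTokenizer

fast_tok = AutoTokenizer.from_pretrained("albert-base-v2")                  # fast tokenizer by default
slow_tok = AutoTokenizer.from_pretrained("albert-base-v2", use_fast=False)  # needs sentencepiece
```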
@ -108,6 +108,8 @@ class BartConfig(PretrainedConfig):
|
||||
force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only
|
||||
:obj:`True` for `bart-large-cnn`.
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||
"""
|
||||
model_type = "bart"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
@ -134,9 +136,6 @@ class BartConfig(PretrainedConfig):
|
||||
classifier_dropout=0.0,
|
||||
num_labels=3,
|
||||
is_encoder_decoder=True,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
normalize_before=False,
|
||||
add_final_layer_norm=False,
|
||||
do_blenderbot_90_layernorm=False,
|
||||
@ -145,6 +144,10 @@ class BartConfig(PretrainedConfig):
|
||||
static_position_embeddings=False,
|
||||
add_bias_logits=False,
|
||||
force_bos_token_to_be_generated=False,
|
||||
use_cache=True,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
**common_kwargs
|
||||
):
|
||||
r"""
|
||||
@ -208,6 +211,8 @@ class BartConfig(PretrainedConfig):
|
||||
|
||||
self.do_blenderbot_90_layernorm = do_blenderbot_90_layernorm
|
||||
|
||||
self.use_cache = use_cache
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
@ -946,7 +946,7 @@ class BartModel(PretrainedBartModel):
|
||||
)
|
||||
class BartForConditionalGeneration(PretrainedBartModel):
|
||||
base_model_prefix = "model"
|
||||
authorized_missing_keys = [r"final_logits_bias", r"encoder\.version", r"decoder\.version"]
|
||||
_keys_to_ignore_on_load_missing = [r"final_logits_bias", r"encoder\.version", r"decoder\.version"]
|
||||
|
||||
def __init__(self, config: BartConfig):
|
||||
super().__init__(config)
|
||||
|
@ -1020,10 +1020,10 @@ class TFBartModel(TFPretrainedBartModel):
|
||||
)
|
||||
class TFBartForConditionalGeneration(TFPretrainedBartModel):
|
||||
base_model_prefix = "model"
|
||||
authorized_missing_keys = [
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r"final_logits_bias",
|
||||
]
|
||||
authorized_unexpected_keys = [
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r"model.encoder.embed_tokens.weight",
|
||||
r"model.decoder.embed_tokens.weight",
|
||||
]
|
||||
|
@ -178,6 +178,7 @@ class BertEmbeddings(nn.Module):
|
||||
|
||||
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
||||
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
|
||||
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
|
||||
if input_ids is not None:
|
||||
@ -222,6 +223,10 @@ class BertSelfAttention(nn.Module):
|
||||
self.value = nn.Linear(config.hidden_size, self.all_head_size)
|
||||
|
||||
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
||||
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
||||
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
|
||||
self.max_position_embeddings = config.max_position_embeddings
|
||||
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
|
||||
|
||||
def transpose_for_scores(self, x):
|
||||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
||||
@ -598,7 +603,7 @@ class BertPreTrainedModel(PreTrainedModel):
|
||||
config_class = BertConfig
|
||||
load_tf_weights = load_tf_weights_in_bert
|
||||
base_model_prefix = "bert"
|
||||
authorized_missing_keys = [r"position_ids"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
||||
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights """
|
||||
@ -864,7 +869,7 @@ class BertModel(BertPreTrainedModel):
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next
|
||||
Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
|
||||
sentence prediction (classification)` head.
|
||||
""",
|
||||
BERT_START_DOCSTRING,
|
||||
@ -969,8 +974,8 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
)
|
||||
class BertLMHeadModel(BertPreTrainedModel):
|
||||
|
||||
authorized_unexpected_keys = [r"pooler"]
|
||||
authorized_missing_keys = [r"position_ids", r"predictions.decoder.bias"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
@ -1087,8 +1092,8 @@ class BertLMHeadModel(BertPreTrainedModel):
|
||||
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
|
||||
class BertForMaskedLM(BertPreTrainedModel):
|
||||
|
||||
authorized_unexpected_keys = [r"pooler"]
|
||||
authorized_missing_keys = [r"position_ids", r"predictions.decoder.bias"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
@ -1469,7 +1474,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
||||
)
|
||||
class BertForTokenClassification(BertPreTrainedModel):
|
||||
|
||||
authorized_unexpected_keys = [r"pooler"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
@ -1560,7 +1565,7 @@ class BertForTokenClassification(BertPreTrainedModel):
|
||||
)
|
||||
class BertForQuestionAnswering(BertPreTrainedModel):
|
||||
|
||||
authorized_unexpected_keys = [r"pooler"]
|
||||
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
@ -183,6 +183,10 @@ class FlaxBertAttention(nn.Module):
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, hidden_state, attention_mask):
|
||||
# Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length)
|
||||
# FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable
|
||||
# with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length)
|
||||
attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
|
||||
self_att = nn.attention.SelfAttention(num_heads=self.num_heads, qkv_features=self.head_size, name="self")(
|
||||
hidden_state, attention_mask
|
||||
)
|
||||
|
@ -91,7 +91,7 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
||||
|
||||
class TFBertPreTrainingLoss:
|
||||
"""
|
||||
Loss function suitable for BERT-like pre-training, that is, the task of pretraining a language model by combining
|
||||
Loss function suitable for BERT-like pretraining, that is, the task of pretraining a language model by combining
|
||||
NSP + MLM. .. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss
|
||||
computation.
|
||||
"""
|
||||
@@ -842,7 +842,7 @@ class TFBertModel(TFBertPreTrainedModel):

@add_start_docstrings(
    """
-    Bert Model with two heads on top as done during the pre-training:
+    Bert Model with two heads on top as done during the pretraining:
    a `masked language modeling` head and a `next sentence prediction (classification)` head.
    """,
    BERT_START_DOCSTRING,
@@ -938,8 +938,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):

-    authorized_unexpected_keys = [r"pooler"]
-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -1023,8 +1023,8 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):

class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):

-    authorized_unexpected_keys = [r"pooler"]
-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -1416,8 +1416,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
)
class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationLoss):

-    authorized_unexpected_keys = [r"pooler"]
-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -1502,8 +1502,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
)
class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss):

-    authorized_unexpected_keys = [r"pooler"]
-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -173,7 +173,7 @@ class BertGenerationPreTrainedModel(PreTrainedModel):

    config_class = BertGenerationConfig
    base_model_prefix = "bert"
-    authorized_missing_keys = [r"position_ids"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """ Initialize the weights """

@@ -80,7 +80,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
        normalization (:obj:`bool`, `optional`, defaults to :obj:`False`)
            Whether or not to apply a normalization preprocess.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
-            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            .. note::
@@ -61,6 +61,9 @@ class CTRLConfig(PretrainedConfig):
            The epsilon to use in the layer normalization layers
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+

    Examples::

@@ -98,6 +101,7 @@ class CTRLConfig(PretrainedConfig):
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
+        use_cache=True,
        **kwargs
    ):
        super().__init__(**kwargs)
@@ -119,6 +123,7 @@ class CTRLConfig(PretrainedConfig):
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
+        self.use_cache = use_cache

    @property
    def max_position_embeddings(self):
@@ -756,7 +756,7 @@ class DebertaPreTrainedModel(PreTrainedModel):

    config_class = DebertaConfig
    base_model_prefix = "deberta"
-    authorized_missing_keys = ["position_ids"]
+    _keys_to_ignore_on_load_missing = ["position_ids"]

    def _init_weights(self, module):
        """ Initialize the weights """
@@ -772,7 +772,7 @@ DEBERTA_START_DOCSTRING = r"""
    The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
    <https://arxiv.org/abs/2006.03654>`_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
    BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
-    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-training data.
+    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
@@ -71,6 +71,13 @@ class DPRConfig(PretrainedConfig):
            The epsilon used by the layer normalization layers.
        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
+            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
+            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
+            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
+            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
+            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
+            <https://arxiv.org/abs/2009.13658>`__.
        projection_dim (:obj:`int`, `optional`, defaults to 0):
            Dimension of the projection for the context and question encoders. If it is set to zero (default), then no
            projection is done.
@@ -93,6 +100,7 @@ class DPRConfig(PretrainedConfig):
        layer_norm_eps=1e-12,
        pad_token_id=0,
        gradient_checkpointing=False,
+        position_embedding_type="absolute",
        projection_dim: int = 0,
        **kwargs
    ):
@@ -112,3 +120,4 @@ class DPRConfig(PretrainedConfig):
        self.layer_norm_eps = layer_norm_eps
        self.gradient_checkpointing = gradient_checkpointing
        self.projection_dim = projection_dim
+        self.position_embedding_type = position_embedding_type
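Since the new `position_embedding_type` argument defaults to "absolute", existing configurations keep their behaviour; the relative variants only kick in when requested explicitly. A quick sketch of selecting the option, with illustrative values only, exercising the constructor arguments added in this diff:

    from transformers import DPRConfig

    # Default: classic learned absolute position embeddings.
    config = DPRConfig()
    print(config.position_embedding_type)  # "absolute"

    # Opt in to Shaw-style relative position embeddings instead.
    relative_config = DPRConfig(position_embedding_type="relative_key", projection_dim=128)
    print(relative_config.position_embedding_type, relative_config.projection_dim)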
@@ -279,7 +279,7 @@ class DPRPretrainedContextEncoder(PreTrainedModel):
    config_class = DPRConfig
    load_tf_weights = None
    base_model_prefix = "ctx_encoder"
-    authorized_missing_keys = [r"position_ids"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def init_weights(self):
        self.ctx_encoder.init_weights()
@@ -294,7 +294,7 @@ class DPRPretrainedQuestionEncoder(PreTrainedModel):
    config_class = DPRConfig
    load_tf_weights = None
    base_model_prefix = "question_encoder"
-    authorized_missing_keys = [r"position_ids"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def init_weights(self):
        self.question_encoder.init_weights()
@@ -309,7 +309,7 @@ class DPRPretrainedReader(PreTrainedModel):
    config_class = DPRConfig
    load_tf_weights = None
    base_model_prefix = "span_predictor"
-    authorized_missing_keys = [r"position_ids"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def init_weights(self):
        self.span_predictor.encoder.init_weights()
@@ -165,6 +165,7 @@ class ElectraEmbeddings(nn.Module):

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward
    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
@@ -211,6 +212,10 @@ class ElectraSelfAttention(nn.Module):
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
@@ -544,8 +549,8 @@ class ElectraPreTrainedModel(PreTrainedModel):
    config_class = ElectraConfig
    load_tf_weights = load_tf_weights_in_electra
    base_model_prefix = "electra"
-    authorized_missing_keys = [r"position_ids"]
-    authorized_unexpected_keys = [r"electra\.embeddings_project\.weight", r"electra\.embeddings_project\.bias"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+    _keys_to_ignore_on_load_unexpected = [r"electra\.embeddings_project\.weight", r"electra\.embeddings_project\.bias"]

    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
    def _init_weights(self, module):
@@ -867,8 +872,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):

@add_start_docstrings(
    """
-    Electra model with a binary classification head on top as used during pre-training for identifying generated
-    tokens.
+    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    It is recommended to load the discriminator checkpoint into that model.
    """,
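The new `distance_embedding` table has one row per possible signed offset between a query position and a key position, which is why it holds 2 * max_position_embeddings - 1 rows. A small sketch of how such a table is indexed, written independently of the library code but following the relative_key scheme the config docstring cites:

    import torch
    import torch.nn as nn

    max_position_embeddings, head_size, seq_length = 512, 64, 5

    distance_embedding = nn.Embedding(2 * max_position_embeddings - 1, head_size)

    positions = torch.arange(seq_length)
    # Signed offset between every query position and every key position: range [-(L-1), L-1].
    distance = positions.view(-1, 1) - positions.view(1, -1)
    # Shift into [0, 2 * max_position_embeddings - 2] so it can index the embedding table.
    relative_position_embeddings = distance_embedding(distance + max_position_embeddings - 1)

    print(relative_position_embeddings.shape)  # torch.Size([5, 5, 64])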
@@ -734,8 +734,7 @@ class TFElectraModel(TFElectraPreTrainedModel):

@add_start_docstrings(
    """
-    Electra model with a binary classification head on top as used during pre-training for identifying generated
-    tokens.
+    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    Even though both the discriminator and generator may be loaded into this model, the discriminator is the only model
    of the two to have the correct classification head to be used for this model.
@@ -109,6 +109,8 @@ class FSMTConfig(PretrainedConfig):
        early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`)
            Flag that will be used by default in the :obj:`generate` method of the model. Whether to stop the beam
            search when at least ``num_beams`` sentences are finished per batch or not.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).

    Examples::

@@ -142,9 +144,6 @@ class FSMTConfig(PretrainedConfig):
        dropout=0.1,
        activation_dropout=0.0,
        init_std=0.02,
-        pad_token_id=1,
-        bos_token_id=0,
-        eos_token_id=2,
        decoder_start_token_id=2,
        is_encoder_decoder=True,
        scale_embedding=True,
@@ -152,6 +151,10 @@ class FSMTConfig(PretrainedConfig):
        num_beams=5,
        length_penalty=1.0,
        early_stopping=False,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
        **common_kwargs
    ):
        if "hidden_size" in common_kwargs:
@@ -196,6 +199,8 @@ class FSMTConfig(PretrainedConfig):
        self.activation_dropout = activation_dropout
        self.dropout = dropout

+        self.use_cache = use_cache
+
    @property
    def num_attention_heads(self) -> int:
        return self.encoder_attention_heads
@@ -951,7 +951,7 @@ class FSMTModel(PretrainedFSMTModel):
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
-        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_dict=False
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
@@ -1005,11 +1005,11 @@ class FSMTModel(PretrainedFSMTModel):
)
class FSMTForConditionalGeneration(PretrainedFSMTModel):
    base_model_prefix = "model"
-    authorized_missing_keys = [
+    _keys_to_ignore_on_load_missing = [
        "model.encoder.embed_positions.weight",
        "model.decoder.embed_positions.weight",
    ]
-    keys_to_never_save = [
+    _keys_to_ignore_on_save = [
        "model.encoder.embed_positions.weight",
        "model.decoder.embed_positions.weight",
    ]
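Alongside the load-time rename, `keys_to_never_save` becomes `_keys_to_ignore_on_save`: weights matched by that list are left out of the checkpoint when the model is serialized, which suits FSMT and Marian because their position embeddings are deterministic and can be rebuilt at load time. A rough illustration of that save-time filtering with a toy state dict, not the actual `save_pretrained` implementation:

    _keys_to_ignore_on_save = [
        "model.encoder.embed_positions.weight",
        "model.decoder.embed_positions.weight",
    ]

    state_dict = {
        "model.encoder.embed_positions.weight": "...",  # recomputable, skipped on save
        "model.decoder.embed_positions.weight": "...",  # recomputable, skipped on save
        "model.encoder.layers.0.self_attn.k_proj.weight": "...",
    }

    to_save = {k: v for k, v in state_dict.items() if k not in _keys_to_ignore_on_save}
    print(sorted(to_save))  # only the attention weight survives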
@@ -1181,7 +1181,7 @@ class TFFunnelModel(TFFunnelPreTrainedModel):

@add_start_docstrings(
    """
-    Funnel model with a binary classification head on top as used during pre-training for identifying generated tokens.
+    Funnel model with a binary classification head on top as used during pretraining for identifying generated tokens.
    """,
    FUNNEL_START_DOCSTRING,
)
@@ -104,6 +104,8 @@ class GPT2Config(PretrainedConfig):
            The dropout ratio to be used after the projection and activation.
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models).

    Example::

@@ -142,9 +144,10 @@ class GPT2Config(PretrainedConfig):
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
+        gradient_checkpointing=False,
+        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
-        gradient_checkpointing=False,
        **kwargs
    ):
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -168,6 +171,7 @@ class GPT2Config(PretrainedConfig):
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        self.gradient_checkpointing = gradient_checkpointing
+        self.use_cache = use_cache

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
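With both flags now living on the config, checkpointing and caching can be chosen up front rather than at call time. A brief usage sketch; the flag combination is illustrative only:

    from transformers import GPT2Config

    # Trade compute for memory during training; the key/value cache is mainly useful at
    # generation time, so it is common to turn it off when gradient checkpointing is on.
    config = GPT2Config(gradient_checkpointing=True, use_cache=False)
    print(config.gradient_checkpointing, config.use_cache)  # True False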
@@ -685,7 +685,7 @@ class GPT2Model(GPT2PreTrainedModel):
    GPT2_START_DOCSTRING,
)
class GPT2LMHeadModel(GPT2PreTrainedModel):
-    authorized_missing_keys = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]

    def __init__(self, config):
        super().__init__(config)
@@ -975,7 +975,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    GPT2_START_DOCSTRING,
)
class GPT2ForSequenceClassification(GPT2PreTrainedModel):
-    authorized_missing_keys = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
+    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]

    def __init__(self, config):
        super().__init__(config)
@@ -146,6 +146,10 @@ class LayoutLMSelfAttention(nn.Module):
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
@@ -509,7 +513,7 @@ class LayoutLMPreTrainedModel(PreTrainedModel):

    config_class = LayoutLMConfig
    base_model_prefix = "layoutlm"
-    authorized_missing_keys = [r"position_ids"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """ Initialize the weights """
@@ -460,6 +460,7 @@ class LongformerEmbeddings(nn.Module):

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        # End copy
        self.padding_idx = config.pad_token_id
@@ -1303,7 +1304,7 @@ class LongformerPreTrainedModel(PreTrainedModel):

    config_class = LongformerConfig
    base_model_prefix = "longformer"
-    authorized_missing_keys = [r"position_ids"]
+    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """ Initialize the weights """
@@ -1621,7 +1622,7 @@ class LongformerModel(LongformerPreTrainedModel):
@add_start_docstrings("""Longformer Model with a `language modeling` head on top. """, LONGFORMER_START_DOCSTRING)
class LongformerForMaskedLM(LongformerPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
@@ -1718,7 +1719,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
)
class LongformerForSequenceClassification(LongformerPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
@@ -1827,7 +1828,7 @@ class LongformerClassificationHead(nn.Module):
)
class LongformerForQuestionAnswering(LongformerPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
@@ -1961,7 +1962,7 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
)
class LongformerForTokenClassification(LongformerPreTrainedModel):

-    authorized_unexpected_keys = [r"pooler"]
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
@@ -1961,7 +1961,7 @@ class TFLongformerModel(TFLongformerPreTrainedModel):
)
class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModelingLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -2048,7 +2048,7 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel
)
class TFLongformerForQuestionAnswering(TFLongformerPreTrainedModel, TFQuestionAnsweringLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -2199,7 +2199,7 @@ class TFLongformerClassificationHead(tf.keras.layers.Layer):
)
class TFLongformerForSequenceClassification(TFLongformerPreTrainedModel, TFSequenceClassificationLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -2443,7 +2443,7 @@ class TFLongformerForMultipleChoice(TFLongformerPreTrainedModel, TFMultipleChoic
)
class TFLongformerForTokenClassification(TFLongformerPreTrainedModel, TFTokenClassificationLoss):

-    authorized_missing_keys = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"pooler"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -1013,7 +1013,7 @@ class LxmertModel(LxmertPreTrainedModel):


@add_start_docstrings(
-    """Lxmert Model with a specified pre-training head on top. """,
+    """Lxmert Model with a specified pretraining head on top. """,
    LXMERT_START_DOCSTRING,
)
class LxmertForPreTraining(LxmertPreTrainedModel):
@@ -1024,7 +1024,7 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
        self.num_qa_labels = config.num_qa_labels
        self.visual_loss_normalizer = config.visual_loss_normalizer

-        # Use of pre-training tasks
+        # Use of pretraining tasks
        self.task_mask_lm = config.task_mask_lm
        self.task_obj_predict = config.task_obj_predict
        self.task_matched = config.task_matched
@@ -1139,7 +1139,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
        self.num_qa_labels = config.num_qa_labels
        self.visual_loss_normalizer = config.visual_loss_normalizer

-        # Use of pre-training tasks
+        # Use of pretraining tasks
        self.task_mask_lm = config.task_mask_lm
        self.task_obj_predict = config.task_obj_predict
        self.task_matched = config.task_matched
@@ -47,11 +47,11 @@ class MarianMTModel(BartForConditionalGeneration):

    """
    config_class = MarianConfig
-    authorized_missing_keys = [
+    _keys_to_ignore_on_load_missing = [
        "model.encoder.embed_positions.weight",
        "model.decoder.embed_positions.weight",
    ]
-    keys_to_never_save = [
+    _keys_to_ignore_on_save = [
        "model.encoder.embed_positions.weight",
        "model.decoder.embed_positions.weight",
    ]
@@ -37,7 +37,7 @@ logger = logging.get_logger(__name__)

@add_start_docstrings("Marian model for machine translation", START_DOCSTRING)
class TFMarianMTModel(TFBartForConditionalGeneration):
-    authorized_missing_keys = [
+    _keys_to_ignore_on_load_missing = [
        r"model.encoder.embed_positions.weight",
        r"model.decoder.embed_positions.weight",
    ]
@@ -29,11 +29,11 @@ class MBartForConditionalGeneration(BartForConditionalGeneration):
    """
    model_type = "mbart"
    config_class = MBartConfig
-    authorized_missing_keys = [
+    _keys_to_ignore_on_load_missing = [
        "model.encoder.embed_positions.weight",
        "model.decoder.embed_positions.weight",
    ]
-    keys_to_never_save = [
+    _keys_to_ignore_on_save = [
        "model.encoder.embed_positions.weight",
        "model.decoder.embed_positions.weight",
    ]
Some files were not shown because too many files have changed in this diff.