Compare commits


2637 Commits

Author SHA1 Message Date
6664ea943d Release: v2.4.0 2020-01-31 09:40:32 -05:00
5a6b138b00 [Umberto] model shortcuts (#2661)
* [Umberto] model shortcuts

cc @loretoparisi @simonefrancia

see #2485

* Ensure that tokenizers will be correctly configured
2020-01-30 21:05:53 -05:00
7fe294bf07 Hotfix: same handling of non-existent files as for config 2020-01-30 20:05:04 -05:00
b85c59f997 config.architectures 2020-01-30 19:26:59 -05:00
f9bc3f5771 style tweak 2020-01-30 19:26:59 -05:00
0b13fb822a No need for a model_type here
cc @lysandrejik
2020-01-30 19:26:59 -05:00
71a382319f Correct documentation 2020-01-30 18:41:24 -05:00
01a14ebd8d Add FlauBERT to automodels 2020-01-30 18:40:22 -05:00
9fa836a73f fill_mask helper (#2576)
* fill_mask helper

* [poc] FillMaskPipeline

* Revert "[poc] FillMaskPipeline"

This reverts commit 67eeea55b0f97b46c2b828de0f4ee97d87338335.

* Revert "fill_mask helper"

This reverts commit cacc17b884e14bb6b07989110ffe884ad9e36eaa.

* README: clarify that Pipelines can also do text-classification

cf. question at the AI&ML meetup last week, @mfuntowicz

* Fix test: test feature-extraction pipeline

* Test tweaks

* Slight refactor of existing pipeline (in preparation of new FillMaskPipeline)

* Extraneous doc

* More robust way of doing this

@mfuntowicz as we don't rely on the model name anymore (see AutoConfig)

* Also add RobertaConfig as a quickfix for wrong token_type_ids

* cs

* [BIG] FillMaskPipeline
2020-01-30 18:15:42 -05:00
b43cb09aaa Add layerdrop 2020-01-30 12:05:01 -05:00
df27648bd9 Rename test_examples to test_doc_samples 2020-01-30 10:07:22 -05:00
93dccf527b Pretrained models 2020-01-30 10:04:18 -05:00
90787fed81 Style 2020-01-30 10:04:18 -05:00
73306d028b FlauBERT documentation 2020-01-30 10:04:18 -05:00
ce2f4227ab Fix failing FlauBERT test 2020-01-30 10:04:18 -05:00
f0a4fc6cd6 Add Flaubert 2020-01-30 10:04:18 -05:00
a5381495e6 Added classifier dropout rate in ALBERT 2020-01-30 09:52:34 -05:00
83446a88d9 Use _pad_token instead of pad_token_id
Requesting pad_token_id would cause an error message when it is None. Use the private _pad_token instead.
2020-01-29 17:44:58 -05:00
9fde13a3ac Add check to verify existence of pad_token_id
In batch_encode_plus we have to ensure that the tokenizer has a pad_token_id so that, when padding, no None values are added as padding. That would happen with gpt2, openai, transfoxl.

closes https://github.com/huggingface/transformers/issues/2640
2020-01-29 17:44:58 -05:00
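A minimal sketch of the guard described in the two commits above, assuming current tokenizer classes; the private `_pad_token` attribute is inspected so that asking for `pad_token_id` never happens when it would be `None`:
```
from transformers import GPT2Tokenizer

# Sketch of the guard described above (not the exact repository code):
# GPT-2 has no padding token, so check the private _pad_token attribute
# before asking for pad_token_id and using it to pad a batch.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer._pad_token is None:
    raise ValueError("Tokenizer has no pad token; set one before padding a batch.")
pad_id = tokenizer.pad_token_id  # safe to use, never None here
```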
e63a81dd25 Style 2020-01-29 16:29:20 -05:00
217349016a Copy object instead of passing the reference 2020-01-29 16:15:39 -05:00
adb8c93134 Remove lines causing a KeyError 2020-01-29 14:01:16 -05:00
c69b082601 Update documentation 2020-01-29 12:06:13 -05:00
ca1d66734d Apply quality and style requirements once again 2020-01-29 12:06:13 -05:00
5e3c72842d bugfix on model name 2020-01-29 12:06:13 -05:00
0731fa1587 Apply quality and style requirements 2020-01-29 12:06:13 -05:00
a3998e76ae Add TF2 CamemBERT model 2020-01-29 12:06:13 -05:00
b5625f131d Style 2020-01-29 11:47:49 -05:00
44a5b4bbe7 Update documentation 2020-01-29 11:47:49 -05:00
7fc628d98e Apply style 2020-01-29 11:47:49 -05:00
64ca855617 Add TF2 XLM-RoBERTa model 2020-01-29 11:47:49 -05:00
9d87eafd11 Streamlining
- mostly stylistic streamlining
- removed 'additional context' sections. They seem to be rarely used and might cause confusion. If more details are needed, users can add them to the 'details' section
2020-01-28 10:41:10 -05:00
a3b3638f6f phrasing 2020-01-28 10:41:10 -05:00
c96ca70f25 Update ---new-benchmark.md 2020-01-28 10:41:10 -05:00
7b5eda32bb Update --new-model-addition.md
Motivate users to @-tag authors of models to increase visibility and expand the community
2020-01-28 10:41:10 -05:00
c63d91dd1c Update bug-report.md
- change references to pytorch-transformers to transformers
- link to code formatting guidelines
2020-01-28 10:41:10 -05:00
b2907cd06e Update feature-request.md
- add 'your contribution' section
- add code formatting link to 'additional context'
2020-01-28 10:41:10 -05:00
2fec88ee02 Update question-help.md
Prefer that general questions are asked on Stack Overflow
2020-01-28 10:41:10 -05:00
7e03d2bd7c update migration guide
Streamlines usages of pytorch-transformers and pytorch-pretrained-bert. Add link to the README for the migration guide.
2020-01-28 10:41:10 -05:00
335dd5e68a Change default save steps from 50 to 500 in all scripts 2020-01-28 09:42:11 -05:00
ea2600bd5f Absolute definitive HeisenDistilBug solve
cc @julien-c @thomwolf
2020-01-27 21:58:36 -05:00
5c3d441ee1 Fix formatting 2020-01-27 21:00:34 -05:00
f5a236c3ca Add Dutch pre-trained BERT model 2020-01-27 21:00:34 -05:00
6b4c3ee234 [run_lm_finetuning] GPT2 tokenizer doesn't have a pad_token
ping @lysandrejik
2020-01-27 20:14:02 -05:00
79815bf666 [serving] Fix typo 2020-01-27 19:58:25 -05:00
5004d5af42 [serving] Update dependencies 2020-01-27 19:58:00 -05:00
9ca21c838b Style 2020-01-27 14:49:12 -05:00
e0849a66ac adding in the doc 2020-01-27 14:27:07 -05:00
6b081f04e6 style and quality 2020-01-27 14:27:07 -05:00
0e31e06a75 Add AutoModelForPreTraining 2020-01-27 14:27:07 -05:00
ea56d305be make style 2020-01-27 12:13:32 -05:00
d440e21f5b add mapping of roberta for QA 2020-01-27 12:12:46 -05:00
875c4ae48f Definitive HeisenDistilBug fix
cc @julien-c @thomwolf
2020-01-27 12:09:58 -05:00
f09f42d4d3 Input Embeddings should be assigned
cc @julien-c
2020-01-27 11:46:00 -05:00
bac51fba3a Fix token_type_ids for XLM-R 2020-01-27 11:08:31 -05:00
babd41e7fa Code quality 2020-01-24 17:06:55 -05:00
974d083c7b Accurate model for configuration 2020-01-24 16:46:03 -05:00
983fef469c AutoModels doc 2020-01-24 16:37:30 -05:00
009fcb0ec1 Configuration utils 2020-01-24 16:37:30 -05:00
11b13e94a3 Add type to help my IDE out 2020-01-24 14:00:57 -05:00
1ce3fb5cc7 update correct eval metrics (distilbert & co) 2020-01-24 11:45:22 -05:00
62f5804608 Update the doc string for T5WithLMHeadModel
T5WithLMHeadModel's doc string claims that indices of -1 are
ignored while computing the cross-entropy loss in the forward
pass; however, indices of -1 throw an error while indices of -100
are ignored. This commit updates the doc string to be consistent
with the class's behavior.
2020-01-24 10:28:20 -05:00
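The behaviour described above can be checked with a few lines of PyTorch; `-100` is the default `ignore_index` of the cross-entropy loss, while `-1` is simply an out-of-range class index:
```
import torch
import torch.nn.functional as F

# Three positions over a 10-class vocabulary; the middle label is masked.
logits = torch.randn(3, 10)
labels = torch.tensor([4, -100, 7])

# Positions labelled -100 are ignored (default ignore_index of cross_entropy);
# a label of -1 would instead raise an out-of-range index error.
loss = F.cross_entropy(logits, labels)
```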
908230d261 Pickle CamemBERT tokenizer 2020-01-24 10:08:59 -05:00
24d5ad1dcc Run the examples in slow 2020-01-23 09:38:45 -05:00
9ddf60b694 Tips + whitespaces 2020-01-23 09:38:45 -05:00
0e9899f451 Fixes 2020-01-23 09:38:45 -05:00
48ac24020d TF CTRL 2020-01-23 09:38:45 -05:00
7511f3dd89 PyTorch CTRL + Style 2020-01-23 09:38:45 -05:00
980211a63a XLM-RoBERTa 2020-01-23 09:38:45 -05:00
6bc966793a TF DistilBERT 2020-01-23 09:38:45 -05:00
db1a7f27a1 PyTorch DistilBERT 2020-01-23 09:38:45 -05:00
b28020f590 TF RoBERTa 2020-01-23 09:38:45 -05:00
3e1bc27e1b Pytorch RoBERTa 2020-01-23 09:38:45 -05:00
f44ff574d3 Camembert 2020-01-23 09:38:45 -05:00
264eb23912 TF XLM 2020-01-23 09:38:45 -05:00
ccebcae75f PyTorch XLM 2020-01-23 09:38:45 -05:00
92b3cb786d TF XLNet 2020-01-23 09:38:45 -05:00
cd656fb21a PyTorch XLNet 2020-01-23 09:38:45 -05:00
83fa8d9fb5 TF Transformer-XL 2020-01-23 09:38:45 -05:00
98edad418e PyTorch Transformer-XL 2020-01-23 09:38:45 -05:00
96d21ad06b TF OpenAI GPT 2020-01-23 09:38:45 -05:00
850795c487 Pytorch GPT 2020-01-23 09:38:45 -05:00
1487b840d3 TF GPT2 2020-01-23 09:38:45 -05:00
bd0d3fd76e GPT-2 PyTorch models + better tips for BERT 2020-01-23 09:38:45 -05:00
dbeb7fb4e6 BERT TensorFlow 2020-01-23 09:38:45 -05:00
cd77c750c5 BERT PyTorch models 2020-01-23 09:38:45 -05:00
3922a2497e TF ALBERT + TF Utilities + Fix warnings 2020-01-23 09:38:45 -05:00
00df3d4de0 ALBERT Modeling + required changes to utilities 2020-01-23 09:38:45 -05:00
f81b6c95f2 Flake8 violation 2020-01-23 09:38:45 -05:00
632675ea88 Can test examples spread over multiple blocks 2020-01-23 09:38:45 -05:00
eaa6b9afc6 Require Torch when testing examples 2020-01-23 09:38:45 -05:00
9bab9b83d2 Glossary 2020-01-23 09:38:45 -05:00
64abd3e0aa Multi-line examples can be tested + ALBERT patch for CircleCI
All tests should now work fine.
2020-01-23 09:38:45 -05:00
837577256b Automatic testing of examples
The CircleCI test should fail.
2020-01-23 09:38:45 -05:00
90b7df444f Upload CLI: on win32, use slashes, not os.sep 2020-01-22 22:41:21 -05:00
119dc50e2a Doc tweak on model sharing 2020-01-22 22:40:38 -05:00
34a3c25a30 Fix for XLMRobertaConfig inherits from RobertaConfig
hat tip @stefan-it
2020-01-22 17:50:24 -05:00
1a8e87be4e Line-by-line text dataset (including padding) 2020-01-21 16:57:38 -05:00
b94cf7faac change order 2020-01-21 16:57:38 -05:00
2eaa8b6e56 Easier to not support this, as it could be confusing
cc @lysandrejik
2020-01-21 16:57:38 -05:00
801aaa5508 make style 2020-01-21 16:57:38 -05:00
56d4ba8ddb [run_lm_finetuning] Train from scratch 2020-01-21 16:57:38 -05:00
c7f79815e7 Cleanup unused variables 2020-01-21 11:40:24 -05:00
15579e2d55 [SQuAD v2] Code quality 2020-01-21 11:36:46 -05:00
088fa7b759 Correct segment ID for XLNet single sequence 2020-01-21 11:33:45 -05:00
073219b43f Manage impossible examples SQuAD v2 2020-01-21 11:24:43 -05:00
983c484fa2 add __getstate__ and __setstate__ to XLMRobertaTokenizer 2020-01-21 10:18:24 -05:00
cefd51c50c Fix glue processor failing on tf datasets 2020-01-20 11:46:43 -05:00
ca6ce3040d Fix style 2020-01-20 10:56:23 -05:00
908cd5ea27 Make forward asynchronous to avoid long computations timing out.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-20 10:56:23 -05:00
6e6c8c52ed Fix bad handling of env variable USE_TF / USE_TORCH leading to invalid framework being used.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-20 10:56:23 -05:00
23c6998bf4 Add lower bound to tqdm for tqdm.auto
- It appears that `tqdm` only introduced `tqdm.auto` in 4.27.
- See https://github.com/tqdm/tqdm/releases/tag/v4.27.0.
- Without the lower bound I received the following stack trace in an environment where I already had tqdm installed:
```
  File "/home/brendanr/anaconda3/envs/allennlp/lib/python3.6/site-packages/transformers/__init__.py", line 20, in <module>
    from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
  File "/home/brendanr/anaconda3/envs/allennlp/lib/python3.6/site-packages/transformers/file_utils.py", line 24, in <module>
    from tqdm.auto import tqdm
ModuleNotFoundError: No module named 'tqdm.auto'
```
2020-01-17 18:29:11 -05:00
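Expressed as a `setup.py` constraint, the lower bound described above amounts to something like the following (a sketch, not necessarily the exact specifier used in the release):
```
# Dependency specifier expressing the constraint above: tqdm.auto only
# exists from tqdm 4.27 onwards.
install_requires = [
    "tqdm >= 4.27",
]
```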
65a89a8976 Fix BasicTokenizer to respect never_split parameters (#2557)
* add failing test

* fix call to _run_split_on_punc

* format with black
2020-01-17 14:57:56 -05:00
6d5049a24d Fix typo in examples/run_squad.py
Rul -> Run
2020-01-17 11:22:51 -05:00
23a2cea8cb Tokenizer.from_pretrained: fetch all possible files remotely 2020-01-16 16:47:19 -05:00
99f9243de5 same here, try to not serialize too much if unneeded 2020-01-16 16:47:19 -05:00
9d8fd2d40e tokenizer.save_pretrained: only save file if non-empty 2020-01-16 16:47:19 -05:00
6e2c28a14a Run SQuAD warning when the doc stride may be too high 2020-01-16 13:59:26 -05:00
b8f43cb273 Merge pull request #2239 from ns-moosavi/HANS-evaluation-example
HANS evaluation
2020-01-16 13:28:25 +01:00
258ed2eaa8 adding details in readme 2020-01-16 13:21:30 +01:00
50ee59578d update formatting - make flake8 happy 2020-01-16 13:21:30 +01:00
1c9333584a formatting 2020-01-16 13:21:30 +01:00
e25b6fe354 updating readme 2020-01-16 13:21:30 +01:00
27c7b99015 adding details in readme - moving file 2020-01-16 13:21:30 +01:00
99d4515572 HANS evaluation 2020-01-16 13:21:30 +01:00
dc17f2a111 Merge pull request #2538 from huggingface/py3_super
💄 super
2020-01-16 13:17:15 +01:00
880854846b Merge pull request #2540 from huggingface/torch14_fix
[PyTorch 1.4] Fix failing torchscript test for xlnet
2020-01-16 13:16:59 +01:00
d9fa1bad72 Fix failing torchscript test for xlnet
model.parameters() order is apparently not stable (only for xlnet, for some reason)
2020-01-15 20:22:21 -05:00
a98b2ca8c0 Style + fixup BertJapaneseTokenizer 2020-01-15 19:05:51 -05:00
83a41d39b3 💄 super 2020-01-15 18:33:50 -05:00
cd51893d37 Merge branch 'Rexhaif-patch-1' 2020-01-15 18:25:15 -05:00
248aeaa842 Merge branch 'patch-1' of https://github.com/Rexhaif/transformers into Rexhaif-patch-1 2020-01-15 18:22:01 -05:00
c76c3cebed Add check for token_type_ids before tensorizing
Fix an issue where `prepare_for_model()` gives a `KeyError` when
`return_token_type_ids` is set to `False` and `return_tensors` is
enabled.
2020-01-15 12:31:43 -05:00
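A hypothetical sketch of the fix: only tensorize `token_type_ids` when the tokenizer actually returned them, so `return_token_type_ids=False` no longer triggers a `KeyError`:
```
import torch

# Output of prepare_for_model with return_token_type_ids=False (illustrative).
encoded = {"input_ids": [101, 7592, 102]}

# Guard the lookup instead of assuming the key exists.
if "token_type_ids" in encoded:
    encoded["token_type_ids"] = torch.tensor([encoded["token_type_ids"]])
encoded["input_ids"] = torch.tensor([encoded["input_ids"]])
```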
eb59e9f705 Graduate sst-2 to a canonical one 2020-01-15 16:28:50 +00:00
e184ad13cf Close #2392 2020-01-15 15:43:44 +00:00
dfe012ad9d Fix misleading RoBERTa token type ids 2020-01-14 17:47:28 -05:00
c024ab98df Improve padding side documentation 2020-01-14 17:44:23 -05:00
9aeb0b9b8a Improve padding side documentation 2020-01-14 17:43:00 -05:00
715fa638a7 Merge branch 'master' into from_scratch_training 2020-01-14 18:58:21 +00:00
100e3b6f21 Bias should be resized with the weights
Created a link between the linear layer bias and the model attribute bias. This does not change anything for the user nor for the conversion scripts, but allows the `resize_token_embeddings` method to resize the bias as well as the weights of the decoder.

Added a test.
2020-01-14 13:43:45 -05:00
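The linked-bias idea roughly follows the pattern used by BERT-style LM heads: the decoder's bias points at the head-level `bias` parameter, so rebuilding the decoder in `resize_token_embeddings` keeps the two in sync. A sketch with illustrative names:
```
import torch
import torch.nn as nn

class LMHead(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super().__init__()
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        # Link the linear layer's bias to the module-level attribute, so that
        # resizing the decoder also resizes the bias it exposes.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        return self.decoder(hidden_states)
```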
6c32d8bb95 Size > Dimensionality + Remove final TODOs 2020-01-14 14:09:09 +01:00
760164d63b RoBERTa example 2020-01-14 14:09:09 +01:00
387217bd3e Added example usage 2020-01-14 14:09:09 +01:00
7d1bb7f256 Add missing XLNet and XLM models 2020-01-14 14:09:09 +01:00
a1cb100460 Wrap up configurations 2020-01-14 14:09:09 +01:00
c11b6fd393 Update links in all configurations 2020-01-14 14:09:09 +01:00
632682726f Updated Configurations 2020-01-14 14:09:09 +01:00
2b566c182e Merge pull request #2384 from dimagalat/master
Releasing file lock
2020-01-14 13:19:01 +01:00
764f836d52 Update test_tokenization_auto.py 2020-01-13 22:50:34 -05:00
d5831acb07 Update test_tokenization_auto.py 2020-01-13 22:47:33 -05:00
ed6cd597cc Update test_tokenization_auto.py 2020-01-13 22:46:35 -05:00
5cb463a714 Update test_tokenization_auto.py 2020-01-13 22:38:29 -05:00
afc24ea5d4 In a parallel setup this could fail 2020-01-13 23:44:08 +00:00
894812c652 Fixup mapping 2020-01-13 23:34:19 +00:00
b20f11d4ca 🔫 Python35 2020-01-13 23:20:44 +00:00
0304628590 Map configs to models and tokenizers 2020-01-13 23:11:44 +00:00
1fc855e456 [tests] Safety checks on CONFIG_MAPPING 2020-01-13 21:52:55 +00:00
3c86b6f3c5 Py35 doesn't like inline variable types 2020-01-13 20:44:33 +00:00
b803b067bf Config to Model mapping 2020-01-13 20:05:20 +00:00
896a0eb1fd Merge pull request #2459 from Perseus14/patch-4
Update pipelines.py
2020-01-13 16:02:54 +01:00
0d6c17fc1b black formatting 2020-01-13 11:18:27 +01:00
a3085020ed Added repetition penalty to PPLM example (#2436)
* Added repetition penalty

* Default PPLM repetition_penalty to neutral

* Minor modifications to comply with reviewer's suggestions. (j -> token_idx)

* Formatted code with `make style`
2020-01-10 23:00:07 -05:00
cf8a70bf68 More AutoConfig tests 2020-01-11 03:43:57 +00:00
6bb3edc300 Serialize model_type if exists 2020-01-11 03:18:56 +00:00
c6f682c1eb flake 2020-01-11 03:18:31 +00:00
4d1c98c012 AutoConfig + other Auto classes honor model_type 2020-01-11 02:46:17 +00:00
2f32dfd33b Convention: name mixins mixins 2020-01-11 01:24:29 +00:00
e83d9f1c1d cleaning - change ' to " (black requirements) 2020-01-10 19:34:25 -05:00
ebba9e929d minor spring cleaning - missing configs + processing 2020-01-10 19:14:58 -05:00
055e80cfad rm old ConfigTester 2020-01-10 21:36:18 +00:00
b1e1a9f9b2 Merge pull request #2495 from mschrimpf/patch-1
T5: move rp_bucket to relative_attention_bias' device
2020-01-10 22:18:54 +01:00
fd8423321f keep list sorted 2020-01-10 20:36:46 +00:00
0cd81fb99f [isort] declare more third-parties in case no tf install 2020-01-10 20:35:45 +00:00
90d3b787f6 move rp_bucket to relative_attention_bias' device
otherwise, `rp_bucket` will always be on the CPU and fail if `self.relative_attention_bias` is on CUDA
2020-01-10 15:09:10 -05:00
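The fix above amounts to moving the bucket indices to the embedding's device before the lookup; a self-contained illustration (sizes are arbitrary):
```
import torch
import torch.nn as nn

relative_attention_bias = nn.Embedding(32, 8)   # bias table, possibly on CUDA
rp_bucket = torch.randint(0, 32, (4, 4))        # bucket indices, built on CPU

# Move the indices to whatever device the embedding lives on before the lookup.
rp_bucket = rp_bucket.to(relative_attention_bias.weight.device)
values = relative_attention_bias(rp_bucket)
```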
84c0aa1868 num_parameters helper 2020-01-10 17:40:02 +00:00
331065e62d missing import 2020-01-10 11:42:53 +01:00
414e9e7122 indents test 2020-01-10 11:42:53 +01:00
3cdb38a7c0 indents 2020-01-10 11:42:53 +01:00
ebd45980a0 Align with run_squad + fix some errors 2020-01-10 11:42:53 +01:00
45634f87f8 fix Sampler in distributed training - evaluation 2020-01-10 11:42:53 +01:00
af1ee9e648 Move torch.nn.utils.clip_grad_norm_ 2020-01-10 11:42:53 +01:00
164c794eb3 New SQuAD API for distillation script 2020-01-10 11:42:53 +01:00
801f2ac8c7 Add PRETRAINED_INIT_CONFIGURATION to DistilBERT tokenizer 2020-01-10 11:42:21 +01:00
bfec203d4e modified: src/transformers/tokenization_utils.py 2020-01-09 12:54:28 +01:00
f599623a99 PreTrainedTokenizerFast: hotfix _convert_encoding
cc @n1t0
2020-01-08 15:46:37 -05:00
f26a353057 Update pipelines.py
Modified the QA pipeline to consider all features for each example before generating the top-k answers.
The current pipeline only takes one SquadExample, one SquadFeature, one start logit list and one end logit list to retrieve the answer; this is incorrect because one SquadExample can produce multiple SquadFeatures.
2020-01-08 21:12:34 +05:30
16ce15ed4b DistilBERT token type ids removed from inputs in run_squad 2020-01-08 13:18:30 +01:00
f24232cd1b Fix error with global step in run_squad.py 2020-01-08 11:39:00 +01:00
1b59b57b57 ignore_index equal -100 in T5 model 2020-01-08 09:52:10 +01:00
569da80ced Make doc regarding masked indices more clear.
Signed-off-by: Romain Keramitas <r.keramitas@gmail.com>
2020-01-07 17:37:27 +01:00
43114b89ba spelling correction (#2434) 2020-01-07 17:25:25 +01:00
d6a677b14b Fix typograpical errors (#2438) 2020-01-07 17:21:23 +01:00
27c1b656cc Fix error with global step in run_lm_finetuning.py 2020-01-07 16:16:12 +01:00
24df44d9c7 Black version python 3.5 2020-01-07 15:53:42 +01:00
73be60c47b Quotes 2020-01-07 15:34:23 +01:00
6806f8204e fix #2410 2020-01-07 15:20:45 +01:00
176d3b3079 Add support for Albert and XLMRoberta for the Glue example (#2403)
* Add support for Albert and XLMRoberta for the Glue example
2020-01-07 14:55:55 +01:00
9261c7f771 Remove f-string device creation on PyTorch GPU pipelines.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-07 11:46:44 +01:00
91d33c798b Fix issue on pipelines where pytorch's tensors are not copied on the user-specified GPU device.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-07 11:12:31 +01:00
2926852f14 fixed formatting 2020-01-07 11:56:03 +11:00
e2810edc8f removing redundant .flush 2020-01-07 11:47:25 +11:00
c301faa92b Distributed or parallel setup 2020-01-06 18:41:08 -05:00
81d6841b4b GPU text generation: moved the encoded_prompt to the correct device 2020-01-06 15:11:12 +01:00
dd4df80f0b Moved the encoded_prompts to correct device 2020-01-06 15:11:12 +01:00
1efc208ff3 Complete DataProcessor class 2020-01-06 15:02:25 +01:00
c45d0cf60f Improve logging message in the single sentence classification processor 2020-01-06 14:54:36 +01:00
bf89be77b9 Improve logging message in the single sentence classification processor 2020-01-06 14:54:36 +01:00
bf8d4bc674 Improve logging message in glue feature conversion 2020-01-06 14:54:36 +01:00
74755c89b9 Example snippet for BertForQuestionAnswering 2020-01-06 14:41:53 +01:00
0ffc8eaf53 Enforce target version for black.
This should stabilize formatting.
2020-01-05 12:52:14 -05:00
f01b3e6680 fix #2399 an ImportError in official example (#2400)
* fix #2399 an ImportError in official example

* style

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-01-05 12:50:20 -05:00
78528742f1 Fix syntax + link to community page 2020-01-05 12:43:39 -05:00
12e0aa4368 Proposition to include community models in readme 2020-01-05 12:37:11 -05:00
80faf22b4a Updating documentation for converting tensorflow model to reflect the new cli convert format.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-04 13:41:18 +01:00
d0e594f9db Releasing file lock 2020-01-02 09:45:48 +11:00
629b22adcf [run_lm_finetuning] mask_tokens: document types 2020-01-01 12:55:10 -05:00
594ca6dead [debug] Debug Heisenbug, the old school way. 2019-12-29 10:07:21 -05:00
0df4e62da0 [http] Tweak http user-agent (#2353) 2019-12-29 10:06:50 -05:00
f75bf05ce6 Merge pull request #2352 from huggingface/cli_tweaks
Cli tweaks
2019-12-28 15:40:00 +01:00
0d467fd6de Typo 2019-12-27 23:06:48 -05:00
d8293e84f3 [cli] upload: max number of files at the same time 2019-12-27 23:02:53 -05:00
4d6c93e923 Kill __main__ 2019-12-27 22:55:22 -05:00
9b2badf3c9 [cli] Update doc 2019-12-27 22:54:29 -05:00
f78ebc22ad [cli] Add ability to delete remote object 2019-12-27 22:53:49 -05:00
bfe870be65 Hotfix tokenizers version for sdist installs 2019-12-27 11:05:52 -05:00
74ea432847 Merge pull request #2286 from adelevie/patch-2
Typo in tokenization_utils.py
2019-12-27 10:50:47 +01:00
492bea9aa0 Merge pull request #2292 from patrickvonplaten/add_cached_past_for_language_generation
Add cached past for language generation
2019-12-27 10:33:27 +01:00
e213900fa2 Merge pull request #2290 from patrickvonplaten/fix_typo_in_doc_for_language_generation
duplicated line for repeating_words_penalty_for_language_generation
2019-12-27 10:29:06 +01:00
9f5f646442 Merge pull request #2211 from huggingface/fast-tokenizers
Fast tokenizers
2019-12-27 10:24:29 +01:00
9024b19994 Auto-format (fixes previous commit). 2019-12-27 10:13:52 +01:00
3233b58ad4 Quote square brackets in shell commands.
This ensures compatibility with zsh.

Fix #2316.
2019-12-27 08:50:25 +01:00
e6ec24fa88 Better added_tokens handling 2019-12-26 16:49:48 -05:00
599db139f9 Code style update 2019-12-26 15:13:30 -05:00
835b76a46f Handle unk_token
As we discussed, this is handled here directly 
cc @thomwolf
2019-12-26 14:42:55 -05:00
7ead04ce14 FastPreTrainedTokenizer => PreTrainedTokenizerFast 2019-12-26 14:39:39 -05:00
1f82a5d910 Update for changes in tokenizers API 2019-12-26 14:37:55 -05:00
8c67b529f6 Merge pull request #2324 from kashif/patch-1
Typo in serving.py
2019-12-26 12:38:06 +01:00
7211541ade Typo in serving.py 2019-12-26 12:21:40 +01:00
0f6017bee3 improve comments for examples 2019-12-26 00:35:11 +01:00
87c8fca9bc add example for ctrl text generation in docs 2019-12-26 00:29:19 +01:00
88def24c45 merge conflicts - renamed to previous_token singular 2019-12-26 00:27:16 +01:00
822f725a07 duplicated line for repeating_words_penalty_for_language_generation 2019-12-26 00:25:29 +01:00
fc84bd5254 adapt style to predefined style layout 2019-12-25 23:32:44 +01:00
deff792bb6 add prepare inputs for transfo_xl and xlnet 2019-12-25 23:17:24 +01:00
9398058e19 add easy tensor shape match test 2019-12-25 23:17:24 +01:00
90cda45e9e add past re-ordering for beam search 2019-12-25 23:17:24 +01:00
6bca56fdb0 check for self.config.mem_len instead of self.mem_len in _do_output_past 2019-12-25 23:17:24 +01:00
365ccd0af2 make if statements cleaner for prepare_inputs_for_generation 2019-12-25 23:17:24 +01:00
d039c679d2 better naming for if statement 2019-12-25 23:17:24 +01:00
7e0c5c731a changed do_output_past function to check for self.config.output_past instead of self.output_past 2019-12-25 23:17:24 +01:00
eeaa402cd4 rename comments 2019-12-25 23:17:24 +01:00
7bb4271291 remove ipdb debugging statements 2019-12-25 23:17:24 +01:00
267587c258 add and improve comments 2019-12-25 23:17:24 +01:00
d891fd0ae0 add past hidden key states for more efficient language generation & add prepare_inputs for gpt2 and ctrl model 2019-12-25 23:17:24 +01:00
aeef4823ab Merge pull request #2303 from patrickvonplaten/fix_error_with_repetition_penalty
fix repetition penalty error in modeling_utils.py
2019-12-25 22:39:20 +01:00
0412f3d929 Merge pull request #2291 from aaugustin/fix-flake8-F841
Fix F841 flake8 warning
2019-12-25 22:37:42 +01:00
8742c95461 Merge pull request #2289 from patrickvonplaten/fix_effective_batch_size_lang_gen_xlm
fix bug in prepare inputs for language generation for xlm for effective batch_size > 1
2019-12-25 22:30:46 +01:00
1240be3ed9 Merge pull request #2312 from vitaliyradchenko/fix_special_and_add_tokens_loading
Correct tokenization for special and added tokens
2019-12-25 20:52:30 +01:00
b262577d17 add special tokens to unique_added_tokens_encoder 2019-12-25 18:31:35 +02:00
83a2347952 fixed lack of added and special tokens 2019-12-25 18:03:19 +02:00
cea04a2443 Merge pull request #2310 from ShnitzelKiller/scatter-unfix
revert erroneous fix #2276
2019-12-25 12:43:22 +01:00
e1844d9a45 use positional arguments due to inconsistent API 2019-12-25 01:34:02 -08:00
9fb7addd4d revert erroneous fix 2019-12-24 22:26:09 -08:00
734d29b03d tokenizers is now a real dependency 2019-12-24 13:32:41 -05:00
2818e50569 Add tests for fast tokenizers 2019-12-24 13:29:01 -05:00
31c56f2e0b Fix style 2019-12-24 12:43:27 -05:00
951ae99bea BertTokenizerFast 2019-12-24 12:24:24 -05:00
041eac2d6d GPT2TokenizerFast 2019-12-24 12:24:14 -05:00
3471ff0d35 FastPreTrainedTokenizer 2019-12-24 12:23:30 -05:00
18e5bdbec5 fix repetition penalty error in modeling_utils.py 2019-12-24 17:18:05 +01:00
f18ac4c28e fix sequence length for prepare_inputs for xlnet 2019-12-24 16:43:24 +01:00
359dc43837 fix effective batch_size error in prepare_inputs also for xlnet 2019-12-24 16:33:20 +01:00
d98a384cb0 fix bug in prepare inputs for language generation for xlm for effective batch_size > 1 2019-12-24 16:29:54 +01:00
3e0cf49514 adding back last dropout in TF 2.0 T5 2019-12-24 11:30:56 +01:00
35d32308de adding back final dropout in T5 2019-12-24 11:29:49 +01:00
81db12c3ba Merge pull request #2271 from aaugustin/improve-setup-and-requirements
Improve setup and requirements
2019-12-24 11:21:20 +01:00
10724a8123 Run the slow tests every Monday morning. 2019-12-24 09:09:43 +01:00
a8d34e534e Remove [--editable] in install instructions.
Use -e only in docs targeted at contributors.

If a user copy-pastes a command line with [--editable], they will hit
an error. If they don't know the --editable option, we're giving them
a choice to make before they can move forward, but this isn't a choice
they need to make right now.
2019-12-24 08:46:08 +01:00
e74c73a85d Enable F841 warning in flake8. 2019-12-23 22:38:23 +01:00
e6c0019c80 Remove unused variables in tests. 2019-12-23 22:38:18 +01:00
495580dad1 Remove unused variables in templates. 2019-12-23 22:38:18 +01:00
71f94a8a1c Remove unused variables in src. 2019-12-23 22:38:09 +01:00
81422c4e6d Remove unused variables in examples. 2019-12-23 22:29:02 +01:00
072750f4dc Merge pull request #2288 from aaugustin/better-handle-optional-imports
Improve handling of optional imports
2019-12-23 22:28:47 +01:00
4621ad6f9d Use the same pattern as everywhere else.
This is really just for consistency.
2019-12-23 21:30:04 +01:00
a31d4a2971 Reraise ImportError when sentencepiece isn't installed.
Otherwise, the next line fails with a confusing exception because the spm
variable isn't defined.
2019-12-23 21:27:42 +01:00
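The pattern in question is a plain re-raise inside the import guard, so a missing optional dependency surfaces as an `ImportError` rather than a later `NameError` on `spm`; a minimal sketch:
```
try:
    import sentencepiece as spm
except ImportError:
    # Re-raise instead of swallowing the error; otherwise the first use of
    # `spm` below would fail with a confusing NameError.
    raise

# ... code that uses spm.SentencePieceProcessor() ...
```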
c8b0c1e551 Improve exception type.
ImportError isn't really appropriate when there's no import involved.
2019-12-23 21:27:38 +01:00
4c09a96096 Simplify re-raising exceptions.
Most modules use the simpler `raise` version. Normalize those that don't.
2019-12-23 21:20:54 +01:00
5565dcdd35 Remove warning when scikit-learn isn't available.
Most users don't need it.
2019-12-23 21:16:26 +01:00
8a6881822a Run some tests on Python 3.7.
This will improve version coverage.
2019-12-23 21:06:23 +01:00
7a865821d9 Remove stray egg-info directory automatically.
If a user or contributor ran `pip install -e .` on transformers < 3.0,
pip created a transformers.egg-info directory next to the transformers
directory at the root of the repository.

In transformers 3.0, the source is in a `src` subdirectory.
`pip install -e .` creates a transformers.egg-info directory there.
However, pip will still pick up transformers.egg-info from the previous
location. This is a bug: https://github.com/pypa/pip/issues/5466

Users and contributors are likely to hit this problem because the
documentation for transformers 3.0 relies heavily on extras_require,
which didn't exist in earlier versions, so they aren't defined in a stale
transformers.egg-info directory.

If such a directory exists, remove it. It's autogenerated, gitignored
and not supposed to contain anything of value.
2019-12-23 21:06:23 +01:00
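A sketch of the cleanup this commit describes (an assumption about the mechanism, written as it might appear near the top of `setup.py`):
```
import os
import shutil

# Remove a stale transformers.egg-info left over from an in-place install of
# an older layout, so pip doesn't pick up its outdated metadata.
stale_egg_info = os.path.join(os.path.dirname(os.path.abspath(__file__)), "transformers.egg-info")
if os.path.isdir(stale_egg_info):
    shutil.rmtree(stale_egg_info)
```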
70373a5f7c Update contribution instructions.
Also provide shortcuts in a Makefile.
2019-12-23 21:05:30 +01:00
c3783399db Remove redundant requirements with transformers. 2019-12-23 19:17:27 +01:00
d79e9c9a9a Remove docs/requirements.txt.
It's superseded by the "docs" extras.
2019-12-23 19:17:07 +01:00
d73eb552e8 Remove requirements.txt.
It's redundant with setup.py and, also, incomplete (e.g. numpy).
2019-12-23 19:15:08 +01:00
9fcc532df6 Remove requirements-dev.txt.
It was generated once, likely in a non-reproducible way (pip freeze
in a contributor's local environment), and never updated.
2019-12-23 19:14:36 +01:00
76a1417f2a Include all optional dependencies in extras.
Take advantage of this to simplify the Circle CI configuration.

Don't bother with tensorboardX: it's a fallback for PyTorch < 1.1.0.
2019-12-23 19:14:31 +01:00
9fc8dcb2a0 Standardize import.
Every other file uses this pattern.
2019-12-23 18:45:42 +01:00
f2522869ea Review and update setup.py. 2019-12-23 18:45:42 +01:00
7cef764ec0 Typo in tokenization_utils.py
avoir -> avoid
2019-12-23 12:14:50 -05:00
23dad8447c Install deps from setup.py for building docs.
requirements.txt isn't up to date.
2019-12-23 17:06:32 +01:00
d8e33dbd67 Fix path to source code in docs config.
This should fix API docs, which went AWOL with yesterday's changes.
2019-12-23 16:49:35 +01:00
59b123bc50 fix tqdm logging level 2019-12-23 16:47:24 +01:00
ba2378ced5 Merge pull request #2264 from upura/fix-doclink
Fix doc link in README
2019-12-23 12:31:00 +01:00
e4e2a666c9 Merge pull request #2276 from ShnitzelKiller/scatterfix
fix error due to wrong argument name to Tensor.scatter()
2019-12-23 12:19:48 +01:00
398bb03f98 fix out-of-place call to scatter, whose named argument is source, not src 2019-12-22 23:30:52 -08:00
ce50305e5b Merge pull request #2270 from aaugustin/remove-python-2
Remove support for Python 2
2019-12-22 23:04:37 +01:00
1a948d7020 Switch from comments to annotations for types. 2019-12-22 18:56:01 +01:00
1c62e87b34 Use built-in open().
On Python 3, `open is io.open`.
2019-12-22 18:38:56 +01:00
d6eaf4e6d2 Update comments mentioning Python 2. 2019-12-22 18:38:56 +01:00
45841eaf7b Remove references to Python 2 in documentation. 2019-12-22 18:38:56 +01:00
0dddc1494d Remove py3 marker. 2019-12-22 18:38:56 +01:00
75a23d24af Remove import fallbacks. 2019-12-22 18:38:56 +01:00
798b3b3899 Remove sys.version_info[0] == 2 or 3. 2019-12-22 18:38:42 +01:00
8af25b1664 Remove six. 2019-12-22 17:56:09 +01:00
6b2200fc88 Remove u-prefixes. 2019-12-22 17:47:54 +01:00
c824d15aa1 Remove __future__ imports. 2019-12-22 17:47:54 +01:00
b6ea0f43ae Remove duplicate -v flag. 2019-12-22 17:47:27 +01:00
5daca95ddd Merge pull request #2268 from aaugustin/improve-repository-structure
Improve repository structure
2019-12-22 16:41:53 +01:00
54abc67aec Merge pull request #2255 from aaugustin/implement-best-practices
Implement some Python best practices
2019-12-22 16:31:11 +01:00
00204f2b4c Replace CommonTestCases for tokenizers with a mixin.
This is the same change as for (TF)CommonTestCases for modeling.
2019-12-22 15:35:25 +01:00
a3c5883f2c Rename file for consistency. 2019-12-22 15:35:25 +01:00
daf8bebcdd Remove unused GPTModelTester.
It isn't imported anywhere.
2019-12-22 15:35:25 +01:00
345c23a60f Replace (TF)CommonTestCases for modeling with a mixin.
I suspect the wrapper classes were created in order to prevent the
abstract base class (TF)CommonModelTester from being included in test
discovery and running, because that would fail.

I solved this by replacing the abstract base class with a mixin.

Code changes are just de-indenting and automatic reformattings
performed by black to use the extra line space.
2019-12-22 15:35:18 +01:00
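The mixin approach works because unittest only discovers classes that inherit from `TestCase`: shared checks live in a plain mixin, and each concrete test class combines it with `TestCase`. A minimal sketch with illustrative names:
```
import unittest

class ModelTesterMixin:
    # Shared checks; not discovered on its own because it isn't a TestCase.
    model_class = None

    def test_model_class_is_set(self):
        self.assertIsNotNone(self.model_class)

class BertModelTest(ModelTesterMixin, unittest.TestCase):
    model_class = object  # stand-in for a real model class
```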
7e98e211f0 Remove unittest.main() in test modules.
This construct isn't used anymore these days.

Running python tests/test_foo.py puts the tests/ directory on
PYTHONPATH, which isn't representative of how we run tests.

Use python -m unittest tests/test_foo.py instead.
2019-12-22 14:42:03 +01:00
6be7cdda66 Move source code inside a src subdirectory.
This prevents transformers from being importable simply because the CWD
is the root of the git repository, while not being importable from other
directories. That led to inconsistent behavior, especially in examples.

Once you fetch this commit, in your dev environment, you must run:

    $ pip uninstall transformers
    $ pip install -e .
2019-12-22 14:15:13 +01:00
ced0a94204 Switch test files to the standard test_*.py scheme. 2019-12-22 14:15:13 +01:00
067395d5c5 Move tests outside of library. 2019-12-22 13:47:17 +01:00
698f9e3d7a Remove trailing whitespace in README. 2019-12-22 13:29:58 +01:00
c11b3e2926 Sort imports for optional third-party libraries.
These libraries aren't always installed in the virtual environment where
isort is running. Declaring them properly avoids mixing these
third-party imports with local imports.
2019-12-22 11:19:13 +01:00
2a34d5b71b Stabilize import order for packaging.
I don't want to consider it a dependency of transformers, but it's
usually there in local development and usually not there in CI.
2019-12-22 11:07:31 +01:00
c9270086ea Disable flake8 F841 in CI to get a passing run.
I'll fix it later.
2019-12-22 11:00:06 +01:00
577a03664d Enforce flake8 in CI. 2019-12-22 11:00:04 +01:00
7c6812645a Restore proper import for HTTPError. 2019-12-22 10:59:08 +01:00
939148b050 Fix F401 flake8 warning (x28).
Do manually what autoflake couldn't manage.
2019-12-22 10:59:08 +01:00
783a616999 Fix F401 flake8 warning (x88 / 116).
This change is mostly autogenerated with:

    $ python -m autoflake --in-place --recursive --remove-all-unused-imports --ignore-init-module-imports examples templates transformers utils hubconf.py setup.py

I made minor changes in the generated diff.
2019-12-22 10:59:08 +01:00
80327a13ea Fix F401 flake8 warning (x152 / 268).
This change is mostly autogenerated with:

    $ python -m autoflake --in-place --recursive examples templates transformers utils hubconf.py setup.py

I made minor changes in the generated diff.
2019-12-22 10:59:08 +01:00
654e051e2a Ignore F401 flake8 warning (x326 / 594). 2019-12-22 10:59:08 +01:00
fa2ccbc081 Fix E266 flake8 warning (x90). 2019-12-22 10:59:08 +01:00
2ab78325f0 Fix F821 flake8 warning (x47).
Ignore warnings related to Python 2, because it's going away soon.
2019-12-22 10:59:07 +01:00
631be27078 Fix E722 flake8 warnings (x26). 2019-12-22 10:59:07 +01:00
b0f7db73cd Fix E741 flake8 warning (x14). 2019-12-22 10:59:07 +01:00
ea89bec185 Fix E231 flake8 warning (x9). 2019-12-22 10:59:07 +01:00
fd2f17a7a1 Fix E714 flake8 warning (x8). 2019-12-22 10:59:07 +01:00
5eab3cf6bc Fix W605 flake8 warning (x5). 2019-12-22 10:59:07 +01:00
7dce8dc7ac Fix E731 flake8 warning (x3). 2019-12-22 10:59:07 +01:00
eed46f38b7 Fix E302 flake8 warning (x3). 2019-12-22 10:59:07 +01:00
b1de7ae08a Fix F811 flake8 warning (x1). 2019-12-22 10:59:07 +01:00
357db7098c Fix E712 flake8 warning (x1). 2019-12-22 10:59:07 +01:00
f9c5317db2 Fix E265 flake8 warning (x1). 2019-12-22 10:59:07 +01:00
28e608a2c2 Remove trailing whitespace from all Python files.
Fixes flake8 warning W291 (x224).
2019-12-22 10:59:07 +01:00
1efa0a7552 Add black-compatible flake8 configuration. 2019-12-22 10:59:07 +01:00
d0c9fe277a Fix circular import in transformers.pipelines.
Submodules shouldn't import from their parent in general.
2019-12-22 10:59:07 +01:00
5ca054757f Update "make style" to sort imports with isort. 2019-12-22 10:59:07 +01:00
9e80fc7b2f Enforce isort in CI.
We need https://github.com/timothycrosley/isort/pull/1000 but there's no
release with this fix yet, so we'll install from GitHub.
2019-12-22 10:59:00 +01:00
158e82e061 Sort imports with isort.
This is the result of:

    $ isort --recursive examples templates transformers utils hubconf.py setup.py
2019-12-22 10:57:46 +01:00
9d00f78f16 fix doc link 2019-12-22 16:07:05 +09:00
b668a740ca Fixing incorrect link in model docstring
The docstring contains a link to the Salesforce/CTRL repo, while the model itself is Facebookresearch/mmbt. It may be a copy/paste mistake.
2019-12-22 00:01:14 +03:00
bc1715c1e0 Add black-compatible isort configuration.
lines_after_imports = 2 is a matter of taste; I like it.
2019-12-21 17:53:18 +01:00
36883c1192 Add "make style" to format code with black. 2019-12-21 17:53:18 +01:00
6e5291a915 Enforce black in CI. 2019-12-21 17:53:18 +01:00
fa84ae26d6 Reformat source code with black.
This is the result of:

    $ black --line-length 119 examples templates transformers utils hubconf.py setup.py

There are a lot of fairly long lines in the project. As a consequence, I'm
picking the longest widely accepted line length, 119 characters.

This is also Thomas' preference, because it allows for explicit variable
names, to make the code easier to understand.
2019-12-21 17:52:29 +01:00
63e3827c6b Remove empty file.
Likely it was added by accident.
2019-12-21 15:38:08 +01:00
645713e2cb Merge pull request #2254 from huggingface/fix-tfroberta
adding positional embeds masking to TFRoBERTa
2019-12-21 15:33:22 +01:00
73f6e9817c Merge pull request #2115 from suvrat96/add_mmbt_model
[WIP] Add MMBT Model to Transformers Repo
2019-12-21 15:26:08 +01:00
77676c27d2 adding positional embeds masking to TFRoBERTa 2019-12-21 15:24:48 +01:00
344126fe58 move example to mm-imdb folder 2019-12-21 15:06:52 +01:00
5b7fb6a4a1 Merge pull request #2134 from bkkaggle/saving-and-resuming
closes #1960 Add saving and resuming functionality for remaining examples
2019-12-21 15:03:53 +01:00
6f68d559ab Merge pull request #2130 from huggingface/ignored-index-coherence
[BREAKING CHANGE] Setting all ignored index to the PyTorch standard
2019-12-21 14:55:40 +01:00
1ab25c49d3 Merge branch 'master' into pr/2115 2019-12-21 14:54:30 +01:00
b03872aae0 fix merge 2019-12-21 14:49:54 +01:00
518ba748e0 Merge branch 'master' into saving-and-resuming 2019-12-21 14:41:39 +01:00
18601c3b6e Merge pull request #2173 from erenup/master
run_squad with roberta
2019-12-21 14:33:16 +01:00
6e7102cfb3 Merge pull request #2203 from gthb/patch-1
fix: wrong architecture count in README
2019-12-21 14:31:44 +01:00
deceb00161 Merge pull request #2177 from mandubian/issue-2106
:zip: #2106 tokenizer.tokenize speed improvement (3-8x) by caching added_tokens in a Set
2019-12-21 14:31:20 +01:00
eeb70cdd77 Merge branch 'master' into saving-and-resuming 2019-12-21 14:29:59 +01:00
ed9b84816e Merge pull request #1840 from huggingface/generation_sampler
[WIP] Sampling sequence generator for transformers
2019-12-21 14:27:35 +01:00
f86ed23189 update doc 2019-12-21 14:13:06 +01:00
cfa0380515 Merge branch 'master' into generation_sampler 2019-12-21 14:12:52 +01:00
300ec3003c fixing run_generation example - using torch.no_grad 2019-12-21 14:02:19 +01:00
1c37746892 fixing run_generation 2019-12-21 13:52:49 +01:00
7e17f09fb5 Merge pull request #1803 from importpandas/fix-xlnet-squad2.0
fix run_squad.py during fine-tuning xlnet on squad2.0
2019-12-21 13:38:48 +01:00
8a2be93b4e fix merge 2019-12-21 13:31:28 +01:00
562f864038 Merge branch 'master' into fix-xlnet-squad2.0 2019-12-21 12:48:10 +01:00
8618bf15d6 Merge pull request #1736 from huggingface/fix-tf-xlnet
Fix TFXLNet
2019-12-21 12:42:05 +01:00
2fa8737c44 Merge pull request #1586 from enzoampil/include_special_tokens_in_bert_examples
Add special tokens to documentation for bert examples to resolve issue: #1561
2019-12-21 12:36:11 +01:00
f15f087143 Merge pull request #1764 from DomHudson/bug-fix-1761
Bug-fix: Roberta Embeddings Not Masked
2019-12-21 12:13:27 +01:00
fae4d1c266 Merge pull request #2217 from aaugustin/test-parallelization
Support running tests in parallel
2019-12-21 11:54:23 +01:00
b8e924e10d Restore test.
This looks like debug code accidentally committed in b18509c2.

Refs #2250.
2019-12-21 08:50:15 +01:00
767bc3ca68 Fix typo in model name.
This looks like a copy/paste mistake. Probably this test was never run.

Refs #2250.
2019-12-21 08:46:26 +01:00
343c094f21 Run examples separately from tests.
This optimizes the total run time of the Circle CI test suite.
2019-12-21 08:43:19 +01:00
80caf79d07 Prevent excessive parallelism in PyTorch.
We're already using as many processes in parallel as we have CPU cores.
Furthermore, the number of cores may be incorrectly calculated as 36
(we've seen this in pytest-xdist), which compounds the problem.

PyTorch performance craters without this.
2019-12-21 08:43:19 +01:00
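One way to implement this (an assumption, not necessarily the exact mechanism used in the repo) is to cap PyTorch's intra-op thread count in the test setup, so that parallel test processes don't each spawn a full set of CPU threads:
```
import torch

# Each test worker is already a separate process, so limit the threads
# PyTorch uses inside each worker to avoid oversubscribing the CPU.
torch.set_num_threads(1)
```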
bb3bfa2d29 Distribute tests from the same file to the same worker.
This should prevent two issues:

- hitting API rate limits for tests that hit the HF API
- multiplying the cost of expensive test setups
2019-12-21 08:43:19 +01:00
29cbab98f0 Parallelize tests on Circle CI.
Set the number of CPUs manually based on the Circle CI resource class,
or else we're getting 36 CPUs, which is far too much (perhaps that's
the underlying hardware and not what Circle CI allocates to us).

Don't parallelize the custom tokenizers tests because they take less
than one second to run and parallelization actually makes them slower.
2019-12-21 08:43:19 +01:00
a4c9338b83 Prevent parallel downloads of the same file with a lock.
Since the file is written to the filesystem, a filesystem lock is the
way to go here. Add a dependency on the third-party filelock library to
get cross-platform functionality.
2019-12-21 08:43:19 +01:00
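The filelock dependency gives a cross-platform lock keyed on the target path; a sketch of the idea (paths and names are illustrative):
```
import os
import tempfile

from filelock import FileLock

target = os.path.join(tempfile.gettempdir(), "model.bin")

# Only one process downloads; the others block on the lock and then reuse
# the file that the first process wrote.
with FileLock(target + ".lock"):
    if not os.path.exists(target):
        with open(target, "wb") as f:
            f.write(b"...downloaded bytes...")
```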
b670c26684 Take advantage of the cache when running tests.
Caching models across test cases and across runs of the test suite makes
slow tests somewhat more bearable.

Use gettempdir() instead of /tmp in tests. This makes it easier to
change the location of the cache with semi-standard TMPDIR/TEMP/TMP
environment variables.

Fix #2222.
2019-12-21 08:43:19 +01:00
b67fa1a8d2 Download models directly to cache_dir.
This allows moving the file instead of copying it, which is more
reliable. Also it avoids writing large amounts of data to /tmp,
which may not be large enough to accommodate it.

Refs #2222.
2019-12-21 08:43:19 +01:00
286d5bb6b7 Use a random temp dir for writing pruned models in tests. 2019-12-21 08:43:19 +01:00
478e456e83 Use a random temp dir for writing file in tests. 2019-12-21 08:43:19 +01:00
12726f8556 Remove redundant torch.jit.trace in tests.
This looks like it could be expensive, so don't run it twice.
2019-12-21 08:43:19 +01:00
ac1b449cc9 [doc] move distilroberta to more appropriate place
cc @lysandrejik
2019-12-21 00:09:01 -05:00
3e52915fa7 [RoBERTa] Embeddings: fix dimensionality bug 2019-12-20 19:01:27 -05:00
228f52867c Bug fix: 1764 2019-12-20 18:27:35 -05:00
a80778f40e small refactoring (only aesthetic, not functional) 2019-12-20 17:21:24 -05:00
3df1d2d144 - Create the output directory (whose name is passed by the user in the "save_directory" parameter) where the encoder and decoder will be saved, if it does not exist.
- Empty the output directory, if it contains any files or subdirectories.
- Create the "encoder" directory inside "save_directory", if it does not exist.
- Create the "decoder" directory inside "save_directory", if it does not exist.
- Save the encoder and the decoder in the previous two directories, respectively.
2019-12-20 17:21:24 -05:00
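In code, the directory handling described above is roughly the following (a sketch with assumed names, not the exact implementation):
```
import os
import shutil

def prepare_save_directory(save_directory):
    # Create the output directory, emptying it first if it already has content.
    if os.path.isdir(save_directory) and os.listdir(save_directory):
        shutil.rmtree(save_directory)
    os.makedirs(save_directory, exist_ok=True)
    # Dedicated subdirectories for the two halves of the model.
    os.makedirs(os.path.join(save_directory, "encoder"), exist_ok=True)
    os.makedirs(os.path.join(save_directory, "decoder"), exist_ok=True)
    return save_directory
```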
a436574bfd Release: v2.3.0 2019-12-20 16:22:20 -05:00
d0f8b9a978 Merge pull request #2244 from huggingface/fix-tok-pipe
Fix Camembert and XLM-R `decode` method - Fix NER pipeline alignment
2019-12-20 22:10:39 +01:00
a557836a70 Merge pull request #2191 from huggingface/fix_sp_np
Numpy compatibility for sentence piece
2019-12-20 22:08:08 +01:00
655fd06853 clean up 2019-12-20 21:57:49 +01:00
e5812462fc clean up debug and less verbose tqdm 2019-12-20 21:51:48 +01:00
4775ec354b add overwrite - fix ner decoding 2019-12-20 21:47:15 +01:00
cb6d54bfda Numpy compatibility for sentence piece
convert to int earlier
2019-12-20 15:06:28 -05:00
f79a7dc661 fix NER pipeline 2019-12-20 20:57:45 +01:00
a241011057 fix pipeline NER 2019-12-20 20:43:48 +01:00
e37ca8e11a fix camembert and XLM-R tokenizer 2019-12-20 20:43:42 +01:00
ceae85ad60 fix mc loading 2019-12-20 19:52:24 +01:00
71883b6ddc update link in readme 2019-12-20 19:40:23 +01:00
8d5a47c79b Merge pull request #2243 from huggingface/fix-xlm-roberta
fixing xlm-roberta tokenizer max_length and automodels
2019-12-20 19:34:08 +01:00
79e4a6a25c update serving API 2019-12-20 19:33:12 +01:00
bbaaec046c fixing CLI pipeline 2019-12-20 19:19:20 +01:00
1c12ee0e55 fixing xlm-roberta tokenizer max_length and automodels 2019-12-20 18:28:27 +01:00
65c75fc587 Clean special tokens test 2019-12-20 11:34:16 -05:00
fb393ad994 Added test for all special tokens 2019-12-20 11:29:58 -05:00
90debb9ff2 Keep even the first of the special tokens intact while lowercasing. 2019-12-20 11:29:43 -05:00
b98ff88544 Added pipelines quick tour in README 2019-12-20 15:52:50 +01:00
3a2c4e6f63 Merge pull request #1548 from huggingface/cli
[2.2] - Command-line interface - Pipeline class
2019-12-20 15:28:29 +01:00
4e3f745ba4 add example for Model2Model in quickstart 2019-12-20 09:12:31 -05:00
db0795b5d0 defaults models for tf and pt - update tests 2019-12-20 15:07:00 +01:00
7f74084528 Fix leading axis added when saving through the command run 2019-12-20 14:47:04 +01:00
c37815f130 clean up PT <=> TF 2.0 conversion and config loading 2019-12-20 14:35:40 +01:00
73fcebf7ec update serving command 2019-12-20 13:47:35 +01:00
59941c5d1f Merge pull request #2189 from stefan-it/xlmr
Add support for XLM-RoBERTa
2019-12-20 13:26:38 +01:00
15dda5ea32 remove python 2 tests for circle-ci cc @aaugustin @julien-c @LysandreJik 2019-12-20 13:20:41 +01:00
01ffc65e9b update tests to remove unittest.patch 2019-12-20 13:16:23 +01:00
825697cad4 fix tests 2019-12-20 12:51:10 +01:00
1fa93ca1ea Clean up framework handling 2019-12-20 12:34:19 +01:00
ca6bdb28f6 fix pipelines and rename model_card => modelcard 2019-12-20 12:10:40 +01:00
61d9ee45e3 All tests are green. 2019-12-20 11:47:56 +01:00
ff36e6d8d7 Merge pull request #2231 from huggingface/requests_user_agent
[http] customizable requests user-agent
2019-12-20 10:28:10 +01:00
e516a34a15 Use BasicTokenizer to split over whitespaces. 2019-12-20 09:38:08 +01:00
9d0d1cd339 Filter out entity for NER task. 2019-12-20 09:30:37 +01:00
15d897ff4a [http] customizable requests user-agent 2019-12-19 18:29:22 -05:00
f25e9b6f77 [hf_bucket_url] support for cloudfront urls 2019-12-19 18:28:17 -05:00
a5a06a851e [doc] Param name consistency 2019-12-19 16:24:20 -05:00
1718fb9e74 Minor/basic text fixes (#2229)
* Small clarification

Matches line 431 to line 435 for additional clarity and consistency.

* Fixed minor typo

The letter "s" was previously omitted from the word "docstrings".
2019-12-19 16:23:18 -05:00
9a399ead25 Revert incorrect #1778 2019-12-19 15:45:48 -05:00
3376adc051 configuration/modeling/tokenization: add various fine-tuned XLM-RoBERTa models for English, German, Spanish and Dutch (CoNLL datasets) 2019-12-19 21:30:23 +01:00
e4baa68ddb tick-tock cc @julien-c 2019-12-19 20:37:26 +01:00
149dc376aa fix tests 2019-12-19 20:34:28 +01:00
407093b3fa Merge branch 'cli' of https://github.com/huggingface/transformers into cli 2019-12-19 20:26:51 +01:00
c7be096c39 Merge branch 'master' into cli 2019-12-19 20:26:08 +01:00
a305067f2d Removed __main__ 2019-12-19 19:41:48 +01:00
3492a6ec17 Addressing Thom's comments. 2019-12-19 19:06:44 +01:00
33adab2b91 Fix albert example 2019-12-19 12:40:43 -05:00
a1f1dce0ae Correct max position for SQUAD and TFDS 2019-12-19 12:25:55 -05:00
62c1fc3c1e Removed duplicate XLMConfig, XLMForQuestionAnswering and XLMTokenizer from import statement of run_squad.py script 2019-12-19 09:50:56 -05:00
284572efc0 Fixed typo in the link
Updated documentation due to a typo
2019-12-19 09:36:43 -05:00
ed6ba93912 corrected typo in example for t5 model input argument 2019-12-19 09:34:55 -05:00
81a911cce5 Doc, doc, ... doc. 2019-12-19 15:12:06 +01:00
faef6f6191 Fix logic order for USE_TF/USE_TORCH 2019-12-19 12:28:17 +01:00
5664327c24 Hide train command for now. 2019-12-19 12:27:54 +01:00
3b29322d4c Expose all the pipeline argument on serve command. 2019-12-19 12:24:17 +01:00
fc624716aa Renaming framework env variables flags from NO_ to USE_ 2019-12-19 11:49:06 +01:00
f516cf3956 Allow pipeline to write output in binary format 2019-12-19 11:42:33 +01:00
d72fa2a0f6 Fix inputs_for_model call in QuestionAnsweringPipeline accessing __dict__ on list. 2019-12-19 10:54:10 +01:00
bcc99fd92e Fix wrong automatic config allocation through AutoConfig 2019-12-19 10:32:21 +01:00
a26ce4dee1 examples: add XLM-RoBERTa to glue script 2019-12-19 02:23:01 +01:00
ec5d6c6a70 Addressing issue with NER task omitting first and last word. 2019-12-19 00:12:10 +01:00
fe9aab1055 tokenization: use S3 location for XLM-RoBERTa model 2019-12-18 23:47:48 +01:00
5c5f67a256 modeling: use S3 location for XLM-RoBERTa model 2019-12-18 23:47:00 +01:00
db90e12114 configuration: use S3 location for XLM-RoBERTa model 2019-12-18 23:46:33 +01:00
d0724d0794 Add PipedPipelineDataFormat 2019-12-18 23:27:26 +01:00
7711403bbd Expose config through the cli arguments 2019-12-18 22:59:51 +01:00
8bb166db5d Expose more information in the output of TextClassificationPipeline 2019-12-18 22:53:19 +01:00
f09d999641 docs: fix numbering 😅 2019-12-18 19:49:33 +01:00
dd7a958fd6 docs: add XLM-RoBERTa to pretrained model list (incl. all parameters) 2019-12-18 19:45:46 +01:00
d35405b7a3 docs: add XLM-RoBERTa to index page 2019-12-18 19:45:10 +01:00
3e89fca543 readme: add XLM-RoBERTa to model architecture list 2019-12-18 19:44:23 +01:00
128cfdee9b tokenization add XLM-RoBERTa base model 2019-12-18 19:28:16 +01:00
e778dd854d modeling: add XLM-RoBERTa base model 2019-12-18 19:27:34 +01:00
04b602f96f Put module import on top of the module. 2019-12-18 18:28:39 +01:00
64a971a915 auto: add XLM-RoBERTa to auto tokenization 2019-12-18 18:24:32 +01:00
036831e279 auto: add XLM-RoBERTa to auto modeling 2019-12-18 18:23:42 +01:00
41a13a6375 auto: add XLMRoBERTa to auto configuration 2019-12-18 18:20:27 +01:00
0c88c856d5 Unnest QuestionAnsweringArgumentHandler 2019-12-18 18:18:16 +01:00
8efc6dd544 fix #2214 2019-12-18 10:47:59 -05:00
a2978465a2 Merge branch 'master' into patch-1 2019-12-18 14:54:46 +00:00
01b68be34f converter: remove XLM-RoBERTa specific script (can be done with the script for RoBERTa now) 2019-12-18 12:24:46 +01:00
3d2096f516 further cleanup 2019-12-18 11:50:54 +01:00
ca31abc6d6 tokenization: *align* fairseq and spm vocab to fix some tokenization errors 2019-12-18 11:36:54 +01:00
8e5587fb79 few fixes on sampling 2019-12-18 11:32:37 +01:00
cce3089b65 Merge remote-tracking branch 'upstream/master' into xlmr 2019-12-18 11:05:16 +01:00
641a8decdc clean up code and add arbitrary number of return sequences 2019-12-18 10:43:48 +01:00
e347725d8c More fine-grained control over pipeline creation with config argument. 2019-12-18 10:41:24 +01:00
94c99db34c [FinBERT] fix incorrect url 2019-12-17 20:35:25 -05:00
7ffa817390 [s3] mv files and update links 2019-12-17 20:35:25 -05:00
c5f35e61db Uploaded files to AWS. 2019-12-17 20:35:25 -05:00
abc43ffbff Add pretrained model documentation for FinBERT. 2019-12-17 20:35:25 -05:00
8ac840ff87 Adding Finnish BERT. 2019-12-17 20:35:25 -05:00
a0d386455b Fix outdated tokenizer doc 2019-12-17 20:07:39 -05:00
ea636440d1 [roberta.conversion] Do not hardcode vocab size
and support for fairseq 0.9+
2019-12-17 18:12:22 -05:00
a4df2e0113 update roberta conversion
- update to fix conversion for the updated fairseq model
- create the save directory if it does not exist
2019-12-17 18:12:22 -05:00
77d397202b clean up dead code 2019-12-17 23:28:46 +01:00
bbc0c86f9b beam search + single beam decoding 2019-12-17 23:27:02 +01:00
5e289f69bc regex 2019.12.17 install fails with Python 2 2019-12-17 15:54:05 -05:00
2cff4bd8f3 Fix segmentation fault 2019-12-17 15:54:05 -05:00
55397dfb9b CsvPipelineDataFormat: Fix for single-column 2019-12-17 13:10:51 -05:00
b6938916ac adding beam search 2019-12-17 17:23:36 +01:00
d303f84e7b fix: wrong architecture count in README
Just say “the following” so that this intro doesn't so easily fall out of date :)
2019-12-17 16:18:00 +00:00
2fde5a2489 Initial bunch of documentation. 2019-12-17 12:16:07 +01:00
2f1c745cde update conversion script 2019-12-17 11:47:54 +01:00
83bc5235cf Merge branch 'master' into pr/2189 2019-12-17 11:47:32 +01:00
d7c62661a3 Provide serving dependencies for tensorflow and pytorch (serving-tf, serving-torch) 2019-12-17 11:23:39 +01:00
f349826a57 model: fix cls and sep token for XLM-RoBERTa documentation 2019-12-17 10:36:04 +01:00
f061606277 Merge pull request #2164 from huggingface/cleanup-configs
[SMALL BREAKING CHANGE] Cleaning up configuration classes - Adding Model Cards
2019-12-17 09:10:16 +01:00
805c21aeba tried to fix the failed checks 2019-12-17 11:36:00 +08:00
d000195ee6 add comment for example_index and unique_id in single process 2019-12-17 11:28:34 +08:00
3c6efd0ca3 updated usage example in modeling_roberta for question answering 2019-12-17 11:18:12 +08:00
3f5ccb183e [doc] Clarify uploads
cf 855ff0e91d (commitcomment-36452545)
2019-12-16 18:20:29 -05:00
3cb51299c3 Fix #2109 2019-12-16 16:58:44 -05:00
18a879f475 fix #2180 2019-12-16 16:44:29 -05:00
d803409215 Fix run squad evaluate during training 2019-12-16 16:31:38 -05:00
a468870fd2 refactoring generation 2019-12-16 22:22:30 +01:00
855ff0e91d [doc] Model upload and sharing
ping @lysandrejik @thomwolf

Is this clear enough? Anything we should add?
2019-12-16 12:42:22 -05:00
d064009b72 converter: fix vocab size 2019-12-16 17:23:25 +01:00
a701a0cee1 configuration: fix model name for large XLM-RoBERTa model 2019-12-16 17:17:56 +01:00
59a1aefb1c tokenization: add support for new XLM-RoBERTa model. Add wrapper around fairseq tokenization logic 2019-12-16 17:00:55 +01:00
69f4f058fa model: add support for new XLM-RoBERTa model 2019-12-16 17:00:12 +01:00
a648ff738c configuration: add support for XLM-RoBERTa model 2019-12-16 16:47:39 +01:00
9ed09cb4a3 converter: add conversion script for original XLM-RoBERTa weights to Transformers-compatible weights 2019-12-16 16:46:58 +01:00
d3549b66af module: add support for XLM-RoBERTa (__init__) 2019-12-16 16:38:39 +01:00
a096e2a88b WIP serving through HTTP internally using pipelines. 2019-12-16 16:38:02 +01:00
71b4750517 examples: add support for XLM-RoBERTa to run_ner script 2019-12-16 16:37:27 +01:00
43a4e1bbe4 Addressing issue in varargs handling for question answering. 2019-12-16 16:00:41 +01:00
46ccbb42fc Make CLI run command use integer mapping for device argument. 2019-12-16 15:49:41 +01:00
bbc707cf39 Fix non-keyworded varargs handling in DefaultArgumentHandler for pipeline. 2019-12-16 15:49:09 +01:00
9c391277cc Allow tensors placement on specific device through CLI and pipeline. 2019-12-16 15:19:13 +01:00
1bbdbacd5b update __init__ and saving 2019-12-16 14:38:20 +01:00
955d7ecb57 Refactored Pipeline with dedicated argument handler. 2019-12-16 14:34:54 +01:00
031ad4eb37 improving JSON error messages (for model card and configurations) 2019-12-16 14:20:57 +01:00
db0a9ee6e0 adding albert to TF auto models cc @LysandreJik 2019-12-16 14:08:08 +01:00
a4d07b983a dict of all config and model files cc @LysandreJik 2019-12-16 14:00:32 +01:00
d3418a94ff update tests 2019-12-16 13:52:41 +01:00
56e98ba81a add model cards cc @mfuntowicz 2019-12-16 11:07:27 +01:00
8669598abd update t5 tf 2019-12-16 09:59:36 +01:00
1b8613acb3 updating t5 config class 2019-12-16 09:51:42 +01:00
8e3b1c860f Added FeatureExtraction pipeline. 2019-12-15 01:37:52 +01:00
f1971bf303 Binding pipelines to the cli. 2019-12-15 01:37:16 +01:00
cc0135134b :zip: #2106 basic tokenizer.tokenize global speed improvement (3-8x) by simply caching added_tokens in a Set 2019-12-14 15:25:13 +01:00
dc667ce1a7 double check cc @LysandreJik 2019-12-14 09:56:27 +01:00
7140363e09 update bertabs 2019-12-14 09:44:53 +01:00
a52d56c8d9 Merge branch 'master' into cleanup-configs 2019-12-14 09:43:07 +01:00
e92bcb7eb6 Merge pull request #1739 from huggingface/t5
[WIP] Adding Google T5 model
2019-12-14 09:40:43 +01:00
cbb368ca06 distilbert tests 2019-12-14 09:31:18 +01:00
b6d4284b26 [cli] Uploads: fix + test edge case 2019-12-13 22:44:57 -05:00
a1faaf9962 deleted useless file 2019-12-14 08:57:13 +08:00
c7780700f5 Merge branch 'refs/heads/squad_roberta'
# Conflicts:
#	transformers/data/processors/squad.py
2019-12-14 08:53:59 +08:00
76f0d99f02 Merge remote-tracking branch 'refs/remotes/huggingface/master' 2019-12-14 08:45:17 +08:00
8e9526b4b5 add multiple processing 2019-12-14 08:43:58 +08:00
7bd11dda6f Release: v2.2.2 2019-12-13 16:45:30 -05:00
c3248cf122 Tests for all tokenizers 2019-12-13 16:41:44 -05:00
f2ac50cb55 better for python2.x 2019-12-13 16:41:44 -05:00
4cbdc7d910 missed space 2019-12-13 16:41:44 -05:00
dd2add9f6e more tests 2019-12-13 16:41:44 -05:00
df160af736 🐛 #2096 in tokenizer.decode, space is not joined between all subtexts instead of before added tokens 2019-12-13 16:41:44 -05:00
5b7b78e088 🐛 #2096 in tokenizer.decode, adds a space after special tokens to return right formatted string 2019-12-13 16:41:44 -05:00
866d73ca26 [cli] Upload is now compatible with folders 2019-12-13 16:39:08 -05:00
d461472948 return for SQuAD [BLACKED] 2019-12-13 15:31:52 -05:00
f24a228a93 Speed up tokenization process 2019-12-13 14:50:35 -05:00
c8ed1c82c8 [SQUAD] Load checkpoint when evaluating without training 2019-12-13 12:13:48 -05:00
5c00e344c1 update model doc - switch 3B/11B to 3b/11b 2019-12-13 16:33:29 +01:00
0b51532ce9 Reintroducing the batch_encode_plus method 2019-12-13 16:22:50 +01:00
110394b2ba Merge branch 'master' into t5 2019-12-13 16:03:32 +01:00
5a5c4349e8 Fix summarization to_cpu doc 2019-12-13 10:02:33 -05:00
8ade204098 fix tf 2019-12-13 14:48:47 +01:00
47f0e3cfb7 cleaning up configuration classes 2019-12-13 14:33:24 +01:00
8938b546bf Removed from_config 2019-12-13 14:27:04 +01:00
1ca52567a4 Allow model conversion in the pipeline allocator. 2019-12-13 14:13:14 +01:00
28e64ad5a4 Raise an exception if the pipeline allocator can't determine the tokenizer from the model. 2019-12-13 14:12:54 +01:00
be5bf7b81b Added NER pipeline. 2019-12-13 14:12:17 +01:00
80eacb8f16 Adding labels mapping for classification models in their respective config. 2019-12-13 14:10:22 +01:00
33e72b08d5 fix inner dimensions for 3B/11B models 2019-12-13 11:33:05 +01:00
9b312f9d41 initial version for roberta squad 2019-12-13 14:51:40 +08:00
40ed717232 Merge remote-tracking branch 'refs/remotes/huggingface/master' 2019-12-13 09:10:17 +08:00
7296f1010b Cleanup squad and add allow train_file and predict_file usage 2019-12-12 13:01:04 -05:00
5d67aa21ae [doc] Replicate doc from #2144 2019-12-12 12:39:41 -05:00
3fd71c4431 Update example scripts 2019-12-12 12:08:54 -05:00
fe92755b99 Fix special tokens mask in encode 2019-12-12 11:37:19 -05:00
fbf5455a86 Fix typo in examples/run_glue.py args declaration.
deay -> decay
2019-12-12 11:16:19 -05:00
f19dad61c7 fixing XLM conversion tests with dummy input 2019-12-12 14:46:30 +01:00
f69dbecc38 Expose classification labels mapping (and reverse) in model config. 2019-12-12 10:25:36 +01:00
90df44f0aa Merge pull request #2063 from guillaume-be/special_tokens_mask_value_not_used
special_tokens_mask value was unused and calculated twice
2019-12-12 08:21:46 +01:00
707f9e9241 Merge pull request #2081 from pglock/patch-1
handle string with only whitespaces as empty
2019-12-12 08:20:43 +01:00
137e20a846 Merge pull request #2075 from huggingface/check-link-validity
Check link validity
2019-12-12 08:09:12 +01:00
d5712f7cac Merge branch 'master' into check-link-validity 2019-12-12 08:00:51 +01:00
9c58b236ef Merge pull request #2144 from huggingface/from-pretrained-from-url
Allowing from_pretrained to load from url directly
2019-12-12 07:43:40 +01:00
413f41921b fix merge 2019-12-12 07:34:42 +01:00
386a93f0f8 Merge branch 'master' into from-pretrained-from-url 2019-12-12 07:31:05 +01:00
2d103546ef Merge pull request #2148 from huggingface/fix_encode_plus
Fix encode plus
2019-12-12 07:24:47 +01:00
1748fdf657 [doc] Fix rst table 2019-12-11 18:32:27 -05:00
36fc52a3b4 Update links to weights 2019-12-11 18:32:27 -05:00
371c5ddfad Py2 tests for Lysandre 2019-12-11 18:32:27 -05:00
5505cf7014 Run tests on Py2 too, for Lysandre 2019-12-11 18:32:27 -05:00
9cb97c0c0f Actually run the tests 2019-12-11 18:32:27 -05:00
95854c4a2f Actually run the tests 2019-12-11 18:32:27 -05:00
d2100428d3 Update to new test infra and only run conditionally 2019-12-11 18:32:27 -05:00
597ba7feb3 Support testing Japanese BERT tokenizers 2019-12-11 18:32:27 -05:00
6a43dc9d7d Support Python 2 2019-12-11 18:32:27 -05:00
a09da4eeb0 Add a test for Japanese BERT tokenizers 2019-12-11 18:32:27 -05:00
57b5cb3eaa Fix loading BertJapaneseTokenizer 2019-12-11 18:32:27 -05:00
c03c0dfd23 Add support for Japanese BERT models by cl-tohoku 2019-12-11 18:32:27 -05:00
4f15e5a267 Add tests.
Maybe not the best possible place for the tests, lmk.
2019-12-11 17:41:51 -05:00
18e1f751f1 TF support 2019-12-11 17:07:46 -05:00
31e5b5ff22 Fix tests + first example of doc 2019-12-11 15:22:02 -05:00
3d57c51111 Fix encode plus 2019-12-11 15:10:17 -05:00
c999a3e505 Allow from_pretrained to take a remote identifier 2019-12-11 12:29:58 -05:00
030faccb8d doc: fix pretrained models table 2019-12-11 12:19:21 -05:00
6709739a05 allowing from_pretrained to load from url directly 2019-12-11 18:15:45 +01:00
29570db25b allowing from_pretrained to load from url directly 2019-12-11 17:19:18 +01:00
2e2f9fed55 rm duplicate imports 2019-12-11 11:11:56 -05:00
c28273793e Add missing DistilBert and Roberta to AutoModelForTokenClassification 2019-12-11 15:31:45 +01:00
4c12860f7a Remove misleading documentation 2019-12-11 09:22:37 -05:00
b040bff6df Added supported model to AutoModelTokenClassification 2019-12-11 14:13:58 +01:00
fafd4c86ec fix TF 2.0 version of T5 - update conversion script 2019-12-11 13:47:27 +01:00
6aa919469d Update run_xnli to save optimizer and scheduler states, then resume training from a checkpoint 2019-12-10 19:31:22 -06:00
89896fe04f Update run_ner to save optimizer and scheduler states, then resume training from a checkpoint 2019-12-10 19:31:22 -06:00
fdc05cd68f Update run_squad to save optimizer and scheduler states, then resume training from a checkpoint 2019-12-10 19:31:22 -06:00
854ec5784e Update run_glue to save optimizer and scheduler states, then resume training from a checkpoint 2019-12-10 19:30:36 -06:00
9a24e0cf76 Refactored qa pipeline argument handling + unittests 2019-12-11 00:33:25 +01:00
b72f9d340e Correct index in script 2019-12-10 18:33:17 -05:00
51ae203290 Merge pull request #2129 from leopd/master
Progress indicator improvements when downloading pre-trained models.
2019-12-10 22:18:55 +01:00
ec6fb25c21 Patch documentation 2019-12-10 15:49:20 -05:00
418589244d Uniforming the ignored indices 2019-12-10 15:26:19 -05:00
58d75aa310 Progress indicator improvements when downloading pre-trained models. 2019-12-10 11:36:56 -08:00
6a73382706 Complete warning + cleanup 2019-12-10 14:33:24 -05:00
dc4e9e5cb3 DataParallel for SQuAD + fix XLM 2019-12-10 19:21:20 +00:00
67a8be8e90 fix backward in tests 2019-12-10 17:50:32 +01:00
07bc8efbc3 add greedy decoding and sampling 2019-12-10 17:27:50 +01:00
63e36007ee Make sure padding, cls and other non-context tokens cannot appear in the answer. 2019-12-10 16:47:35 +01:00
f2538c1274 all tests in torch no grad 2019-12-10 16:33:11 +01:00
a5df980c5b updating distilbert test 2019-12-10 16:01:15 +01:00
40a39ab650 Reuse recent SQuAD refactored data structure inside QA pipelines. 2019-12-10 15:59:38 +01:00
7c3a15ace9 Merge branch 'master' into t5 2019-12-10 15:36:54 +01:00
981a5c8c17 updating models urls 2019-12-10 15:36:19 +01:00
e6cff60b4c Merge pull request #2069 from huggingface/cleaner-pt-tf-conversion
clean up PT <=> TF conversion
2019-12-10 15:34:08 +01:00
4b82c485de remove misplaced summarization documentation 2019-12-10 09:13:33 -05:00
8ae1044f80 updating tests and TF 2.0 model 2019-12-10 15:11:07 +01:00
aae74065df Added QuestionAnsweringPipeline unit tests. 2019-12-10 13:37:20 +01:00
a7d3794a29 Remove token_type_ids for compatibility with DistilBert 2019-12-10 13:37:20 +01:00
fe0f552e00 Use attention_mask everywhere. 2019-12-10 13:37:20 +01:00
348e19aa21 Expose attention_masks and input_lengths arguments to batch_encode_plus 2019-12-10 13:37:18 +01:00
c2407fdd88 Enable the Tensorflow backend. 2019-12-10 13:37:14 +01:00
f116cf599c Allow hiding frameworks through environment variables (NO_TF, NO_TORCH). 2019-12-10 13:37:07 +01:00
6e61e06051 batch_encode_plus generates the encoder_attention_mask to avoid attending over padded values. 2019-12-10 13:37:07 +01:00
02110485b0 Added batching, topk, chars index and scores. 2019-12-10 13:36:55 +01:00
e1d89cb24d Added QuestionAnsweringPipeline with batch support. 2019-12-10 13:36:55 +01:00
0558c9cb9b Merge branch 'master' into t5 2019-12-10 12:58:48 +01:00
81babb227e Added download command through the cli.
It allows to predownload models and tokenizers.
2019-12-10 12:18:59 +01:00
31a3a73ee3 updating CLI 2019-12-10 12:18:59 +01:00
7c1697562a compatibility with sklearn and keras 2019-12-10 12:12:22 +01:00
b81ab431f2 updating AutoModels and AutoConfiguration - adding pipelines 2019-12-10 12:11:33 +01:00
2d8559731a add pipeline - train 2019-12-10 11:34:16 +01:00
72c36b9ea2 [WIP] - CLI 2019-12-10 11:33:14 +01:00
e57d00ee10 Merge pull request #1984 from huggingface/squad-refactor
[WIP] Squad refactor
2019-12-10 11:07:26 +01:00
ecabbf6d28 Merge pull request #2107 from huggingface/encoder-mask-shape
create encoder attention mask from shape of hidden states
2019-12-10 10:07:56 +01:00
608a8f5b56 updating tf 2.0 layer_norm to T5 layer norm 2019-12-10 10:01:01 +01:00
df3961121f Add MMBT Model to Transformers Repo 2019-12-09 18:36:48 -08:00
1d18930462 Harmonize no_cuda flag with other scripts 2019-12-09 20:37:55 -05:00
f7eba09007 clean for release 2019-12-09 20:37:55 -05:00
2a64107e44 improve device usage 2019-12-09 20:37:55 -05:00
c0707a85d2 add README 2019-12-09 20:37:55 -05:00
ade3cdf5ad integrate ROUGE 2019-12-09 20:37:55 -05:00
076602bdc4 prevent BERT weights from being downloaded twice 2019-12-09 20:37:55 -05:00
5909f71028 add py-rouge dependency 2019-12-09 20:37:55 -05:00
a1994a71ee simplified model and configuration 2019-12-09 20:37:55 -05:00
3a9a9f7861 default output dir to documents dir 2019-12-09 20:37:55 -05:00
693606a75c update the docs 2019-12-09 20:37:55 -05:00
c0443df593 remove beam search 2019-12-09 20:37:55 -05:00
2403a66598 give transformers API to BertAbs 2019-12-09 20:37:55 -05:00
4d18199902 cast bool tensor to long for pytorch < 1.3 2019-12-09 20:37:55 -05:00
9f75565ea8 setup training 2019-12-09 20:37:55 -05:00
4735c2af07 tweaks to the BeamSearch API 2019-12-09 20:37:55 -05:00
ba089c780b share pretrained embeddings 2019-12-09 20:37:55 -05:00
9660ba1cbd Add beam search 2019-12-09 20:37:55 -05:00
1c71ecc880 load the pretrained weights for encoder-decoder
We currently save the pretrained_weights of the encoder and decoder in
two separate directories `encoder` and `decoder`. However, for the
`from_pretrained` function to operate with automodels we need to
specify the type of model in the path to the weights.

The path to the encoder/decoder weights is handled by the
`PreTrainedEncoderDecoder` class in the `save_pretrained` function. Since
there is no easy way to infer the type of model that was initialized for
the encoder and decoder, we add a parameter `model_type` to the function.
This is not an ideal solution as it is error prone, and the model type
should be carried by the Model classes somehow.

This is a temporary fix that should be changed before merging.
2019-12-09 20:37:55 -05:00
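The layout described in this commit can be pictured with a short, purely hypothetical sketch (the directory names and the `model_type` parameter are illustrative assumptions, not the library's actual code):

```python
import os

def save_pretrained_encoder_decoder(encoder, decoder, save_directory, model_type):
    # Hypothetical sketch: encode the model type in the sub-directory names so that a
    # later from_pretrained call working with automodels can infer it from the path.
    encoder.save_pretrained(os.path.join(save_directory, model_type + "_encoder"))
    decoder.save_pretrained(os.path.join(save_directory, model_type + "_decoder"))
```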
07f4cd73f6 update function to add special tokens
Since I started my PR the `add_special_token_single_sequence` function
has been deprecated for another; I replaced it with the new function.
2019-12-09 20:37:55 -05:00
5c877fe94a fix albert links 2019-12-09 18:53:00 -05:00
79526f82f5 Remove unnecessary epoch variable 2019-12-09 16:24:35 -05:00
9626e0458c Add functionality to continue training from last saved global_step 2019-12-09 16:24:35 -05:00
2d73591a18 Stop saving current epoch 2019-12-09 16:24:35 -05:00
0eb973b0d9 Use saved optimizer and scheduler states if available 2019-12-09 16:24:35 -05:00
a03fcf570d Save tokenizer after each epoch to be able to resume training from a checkpoint 2019-12-09 16:24:35 -05:00
f71b1bb05a Save optimizer state, scheduler state and current epoch 2019-12-09 16:24:35 -05:00
8e651f56b7 fix tf tests 2019-12-09 22:13:57 +01:00
808bb8da7e fix transfo xl tests 2019-12-09 21:48:34 +01:00
b016dd16c9 fix tests on python 3.5 2019-12-09 21:38:07 +01:00
2a4ef098d6 Add ALBERT and XLM to SQuAD script 2019-12-09 10:46:47 -05:00
00c4e39581 Merge branch 'master' into squad-refactor 2019-12-09 10:41:15 -05:00
169fea6855 updating T5 2019-12-09 16:25:33 +01:00
3520be7824 create encoder attention mask from shape of hidden states
We currently create encoder attention masks (when they're not provided)
based on the shape of the inputs to the encoder. This is obviously
wrong; sequences can be of different lengths. We now create the encoder
attention mask based on the batch_size and sequence_length of the
encoder hidden states.
2019-12-09 11:19:45 +01:00
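A minimal PyTorch sketch of the fix described above (the helper name is illustrative, not the library's internal code): the default mask is derived from the encoder hidden states themselves rather than from the decoder inputs, which may have a different length.

```python
import torch

def default_encoder_attention_mask(encoder_hidden_states):
    # (batch_size, sequence_length, hidden_size) -> mask of shape (batch_size, sequence_length)
    batch_size, sequence_length = encoder_hidden_states.shape[:2]
    return torch.ones(batch_size, sequence_length, device=encoder_hidden_states.device)

# Example: a batch of 2 encoder sequences of length 7
hidden = torch.randn(2, 7, 16)
mask = default_encoder_attention_mask(hidden)
print(mask.shape)  # torch.Size([2, 7])
```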
0cb163865a Remove pytest dependency. (#2093) 2019-12-07 07:46:14 -05:00
2670b0d682 Fix bug which lowercases special tokens 2019-12-06 16:15:53 -05:00
35401fe50f Remove dependency on pytest for running tests (#2055)
* Switch to plain unittest for skipping slow tests.

Add a RUN_SLOW environment variable for running them.

* Switch to plain unittest for PyTorch dependency.

* Switch to plain unittest for TensorFlow dependency.

* Avoid leaking open files in the test suite.

This prevents spurious warnings when running tests.

* Fix unicode warning on Python 2 when running tests.

The warning was:

    UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal

* Support running PyTorch tests on a GPU.

Reverts 27e015bd.

* Tests no longer require pytest.

* Make tests pass on cuda
2019-12-06 13:57:38 -05:00
e4679cddce [cli] Uploads: add progress bar (#2078)
* [cli] Uploads: add progress bar

see https://github.com/huggingface/transformers/pull/2044#discussion_r354057827 for context

* rename + documentation

* Add auto-referential comment
2019-12-06 11:56:23 -05:00
1d87b37d10 updating 2019-12-06 15:30:09 +01:00
4cb9b60558 Merge pull request #2077 from patrickvonplaten/change_documentation_for_past_output_shape
corrected documentation for past tensor shape for ctrl and gpt2 model
2019-12-06 12:14:48 +01:00
5482822a2b Merge pull request #2046 from jplu/tf2-ner-example
Add NER TF2 example.
2019-12-06 12:12:22 +01:00
fc1bb1f867 Merge pull request #2068 from huggingface/fix-2042
Nicer error message when Bert's input is missing batch size
2019-12-06 12:06:42 +01:00
21451ec6ba handle string with only whitespaces as empty 2019-12-06 10:32:43 +01:00
f230d91b43 check the validity of links
We add a script and a CI workflow to check that all download links
present in the source code are valid.
2019-12-06 09:41:28 +01:00
d0383e4daf corrected documentation for past tensor shape for ctrl and gpt2 model 2019-12-06 01:24:22 +01:00
e9217da5ff Cleanup
Improve global visibility on the run_squad script, remove unused files and fixes related to XLNet.
2019-12-05 16:01:51 -05:00
9ecd83dace Patch evaluation for impossible values + cleanup 2019-12-05 14:44:57 -05:00
35ff345fc9 update requirements 2019-12-05 12:07:04 -05:00
552c44a9b1 release distilm-bert 2019-12-05 10:14:58 -05:00
ee53de7aac Pr for pplm (#2060)
* license

* changes

* ok

* Update paper link and commands to run

* pointer to uber repo
2019-12-05 09:20:07 -05:00
f8fb4335c9 clean up a little bit PT <=> TF conversion 2019-12-05 15:19:32 +01:00
bebaa14039 Merge pull request #2045 from aaugustin/remove-dead-code
Remove dead code in tests.
2019-12-05 14:41:56 +01:00
18fb93530b fixing #2042 - Nicer error message 2019-12-05 14:36:34 +01:00
2d5d86e037 fix #2031 2019-12-05 14:06:29 +01:00
af077b15e2 Merge pull request #2065 from huggingface/fixing-camembert
Fixing camembert tokenization
2019-12-05 13:45:44 +01:00
3268ebd229 fix xlnet test 2019-12-05 13:35:29 +01:00
6c5297a423 Fixing camembert tokenization 2019-12-05 13:27:58 +01:00
9200a759d7 Add few tests on the TF optimization file with some info in the documentation. Complete the README. 2019-12-05 12:56:43 +01:00
1f179f095f Merge pull request #2011 from AdityaSoni19031997/patch-1
typo fix on the docs as per Pytorch v1.1+
2019-12-05 12:39:04 +01:00
1eaf44e713 Merge pull request #2007 from roskoN/xlnet_attention_fix
fixed XLNet attention output for both attention streams whenever target_mapping is provided
2019-12-05 12:32:39 +01:00
71e4693f08 fix #1968 2019-12-05 12:14:24 +01:00
f9f395b21c Merge pull request #1735 from ondewo/tf-do-not-use-gpu-on-import
Do not use GPU when importing transformers
2019-12-05 11:56:48 +01:00
75a97af6bc fix #1450 - add doc 2019-12-05 11:26:55 +01:00
8b388827b5 fix #1920 2019-12-05 11:18:43 +01:00
d425a4d60b Merge pull request #1870 from alexzubiaga/xlnet-for-token-classification
XLNet for Token classification
2019-12-05 09:54:09 +01:00
1eb89ddf73 Merge pull request #2044 from huggingface/cli_upload
CLI for authenticated file sharing
2019-12-05 09:44:07 +01:00
7f998b1b83 special_tokens_mask value was unused and calculated twice 2019-12-05 09:01:39 +01:00
fb0d2f1da1 preparing release distil-mBERT 2019-12-05 03:00:16 -05:00
3ba417e1a8 [cli] ls: Tabular formatting 2019-12-04 18:40:52 -05:00
ce158a076f Return dataset (pytorch) 2019-12-04 17:55:52 -05:00
7a03519975 Documentation 2019-12-04 17:24:35 -05:00
96fa9a8a70 Python 2 + Post mime-type to S3 2019-12-04 17:22:50 -05:00
33508ae310 Remove only_first 2019-12-04 16:26:45 -05:00
f7e4a7cdfa Cleanup 2019-12-04 16:24:15 -05:00
a7ca6d738b Padding side is tokenizer-dependant 2019-12-04 15:43:34 -05:00
cca75e7884 Kill the demon spawn 2019-12-04 15:42:29 -05:00
bf119c0568 TFDS dataset can now be evaluated 2019-12-04 11:34:59 -05:00
ff98b041da Fix whitespace issue 2019-12-04 16:53:06 +01:00
9ddc3f1a12 Naming update + XLNet/XLM evaluation 2019-12-04 10:37:00 -05:00
5bfcd0485e fix #1991 2019-12-04 14:53:11 +01:00
cae641ff26 Merge pull request #1846 from tamuhey/patch/iss1845
fix summary_type value of SequenceSummary
2019-12-04 13:28:39 +01:00
254ebb979c Bugfix on init file. Missing comma. 2019-12-04 10:00:25 +01:00
ecb923da9c Create a NER example similar to the Pytorch one. It takes the same options, and can be run the same way. 2019-12-04 09:43:15 +01:00
40255ab002 Remove dead code in tests. 2019-12-04 08:21:02 +01:00
e4fbf3e2cc CLI for authenticated file sharing 2019-12-04 00:52:23 -05:00
de276de1c1 Working evaluation 2019-12-03 17:15:51 -05:00
7edb51f3a5 [pplm] split classif head into its own file 2019-12-03 22:07:25 +00:00
c835bc85c2 Compute predictions 2019-12-03 15:28:16 -05:00
285b1241e3 Added SquadResult 2019-12-03 15:00:49 -05:00
8101924a68 Patch: v2.2.1 2019-12-03 11:20:26 -05:00
48cbf267c9 Use full dataset for eval (SequentialSampler in Distributed setting) 2019-12-03 11:01:37 -05:00
f434bfc623 [pplm] Update S3 links
Co-Authored-By: Piero Molino <w4nderlust@gmail.com>
2019-12-03 10:53:02 -05:00
96e83506d1 Always use SequentialSampler during evaluation
When evaluating, shouldn't we always use the SequentialSampler instead of DistributedSampler? Evaluation only runs on 1 GPU no matter what, so if you use the DistributedSampler with N GPUs, I think you'll only evaluate on 1/N of the evaluation set. That's at least what I'm finding when I run an older/modified version of this repo.
2019-12-03 10:15:39 -05:00
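A small, self-contained sketch of the point made above: with a SequentialSampler the single evaluation process sees every example exactly once, whereas a DistributedSampler would hand it only its own shard.

```python
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

# Toy evaluation setup: evaluation runs in one process, so it should iterate the full set.
eval_dataset = TensorDataset(torch.arange(100).unsqueeze(1))
eval_sampler = SequentialSampler(eval_dataset)  # not DistributedSampler
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=8)

seen = sum(batch[0].numel() for batch in eval_dataloader)
print(seen)  # 100 -> every example is evaluated, not 1/N of them
```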
3b48806f75 [pplm] README: add setup + tweaks 2019-12-03 10:14:02 -05:00
0cb2c90890 readme
Co-Authored-By: Rosanne Liu <mimosavvy@gmail.com>
2019-12-03 10:14:02 -05:00
1efb2ae7fc [pplm] move scripts under examples/pplm/ 2019-12-03 10:14:02 -05:00
a59fdd1627 generate_text_pplm now works with batch_size > 1 2019-12-03 10:14:02 -05:00
893d0d64fe Changed order of some parameters to be more consistent. Identical results. 2019-12-03 10:14:02 -05:00
f42816e7fc Added additional check for url and path in discriminator model params 2019-12-03 10:14:02 -05:00
f10b925015 Improvements: model_path renamed pretrained_model, tokenizer loaded from pretrained_model, pretrained_model set to discriminator's when discrim is specified, sample = False by default but cli parameter introduced. To obtain identical samples call the cli with --sample 2019-12-03 10:14:02 -05:00
75904dae66 Removed global variable device 2019-12-03 10:14:02 -05:00
7fd54b55a3 Added support for generic discriminators 2019-12-03 10:14:02 -05:00
b0eaff36e6 Added a +1 to epoch when saving weights 2019-12-03 10:14:02 -05:00
611961ade7 Added tqdm to preprocessing 2019-12-03 10:14:02 -05:00
afc7dcd94d Now run_pplm works on cpu. Identical output as before (when using gpu). 2019-12-03 10:14:02 -05:00
61399e5afe Cleaned perturb_past. Identical output as before. 2019-12-03 10:14:02 -05:00
ffc2935405 Fix for making unconditioned generation work. Identical output as before. 2019-12-03 10:14:02 -05:00
9f693a0c48 Cleaned generate_text_pplm. Identical output as before. 2019-12-03 10:14:02 -05:00
61a12f790d Renamed SmallConst to SMALL_CONST and introduced BIG_CONST. Identical output as before. 2019-12-03 10:14:02 -05:00
ef47b2c03a Removed commented code. Identical output as before. 2019-12-03 10:14:02 -05:00
7ea12db3f5 Removed commented code. Identical output as before. 2019-12-03 10:14:02 -05:00
08c6e456a3 Cleaned full_text_generation. Identical output as before. 2019-12-03 10:14:02 -05:00
6c9c131780 More cleanup for run_model. Identical output as before. 2019-12-03 10:14:02 -05:00
7ffe47c888 Improved device specification 2019-12-03 10:14:02 -05:00
4f2164e40e First cleanup step, changing function names and passing parameters all the way through without using args. Identical output as before. 2019-12-03 10:14:02 -05:00
821de121e8 Minor changes 2019-12-03 10:14:02 -05:00
7469d03b1c Fixed minor bug when running training on cuda 2019-12-03 10:14:02 -05:00
0b51fba20b Added script for training a discriminator for pplm to use 2019-12-03 10:14:02 -05:00
34a83faabe Let's make PPLM great again 2019-12-03 10:14:02 -05:00
d5faa74cd6 tokenizer white space: revert to previous behavior 2019-12-03 10:14:02 -05:00
0b77d66a6d rm extraneous import 2019-12-03 10:14:02 -05:00
83b1e6ac9e fix the loss backward issue
(cherry picked from commit 566468cc984c6ec7e10dfc62b5b4191781a99cd2)
2019-12-03 10:14:02 -05:00
572c24cfa2 PPLM (squashed)
Co-authored-by: piero <piero@uber.com>
Co-authored-by: Rosanne Liu <mimosavvy@gmail.com>
2019-12-03 10:14:02 -05:00
f19a78a634 Merge pull request #1903 from valohai/master
Valohai integration
2019-12-03 16:13:01 +01:00
d100ad99c0 Merge pull request #2014 from aaugustin/mark-tf-auto-model-test-as-slow
Mark tests in TFAutoModelTest as slow.
2019-12-03 16:03:48 +01:00
66fc8d25a5 Change ref to original GLUE downloader script 2019-12-03 10:49:50 +02:00
fbaf05bd92 Remove annoying tokenization message 2019-12-02 18:23:00 -05:00
e85855f2c4 Fix ALBERT exports with pretraining + sp classifier; Fix naming for ALBERT TF models 2019-12-02 18:00:19 -05:00
b3d834ae11 Reorganize ALBERT conversion script 2019-12-02 15:01:52 -05:00
f3776df0f3 WIP debugging 2019-12-02 15:47:00 +01:00
5ab93083e4 Mark tests in TFAutoModelTest as slow.
Each test forces downloading the same 536MB file, which is slow
even with a decent internet connection.
2019-12-01 18:25:15 +01:00
c356290c8d typo fix as per Pytorch v1.1+ 2019-12-01 14:08:14 +05:30
76c0bc06d5 [XLNet] Changed post-processing of attention w.r.t. target_mapping
Whenever target_mapping is provided to the input, XLNet outputs two different attention streams.
Based on that, the attention output would be one of the two:
- a list of tensors (usual case for most transformers)
- a list of 2-tuples of tensors, one tensor for each of the attention streams
Docs and unit-tests have been updated
2019-11-30 21:01:04 +01:00
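A hedged sketch of how a caller might consume this output format (the helper name is illustrative):

```python
def normalize_xlnet_attentions(attentions):
    # Each per-layer entry is either a single tensor, or a 2-tuple of tensors
    # (content stream, query stream) when target_mapping was provided.
    per_layer = []
    for layer_attention in attentions:
        if isinstance(layer_attention, tuple):
            content_stream, query_stream = layer_attention
        else:
            content_stream, query_stream = layer_attention, None
        per_layer.append((content_stream, query_stream))
    return per_layer
```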
b90791e950 fixed XLNet attention output for both attention streams 2019-11-30 15:57:51 +01:00
b0ee7c7df3 Added Camembert to available models 2019-11-29 14:17:02 -05:00
ecf15ebf3b Add ALBERT to AutoClasses 2019-11-29 11:25:37 -05:00
4a666885b5 reducing my level of enthusiasm 2019-11-29 09:40:50 -05:00
adb5c79ff2 update all tf.shape and tensor.shape to shape_list 2019-11-29 09:40:50 -05:00
2421e54f8c Add link to original source and license to download_glue_data.py 2019-11-29 15:39:28 +02:00
41aa0e8003 Refactor logs and fix loss bug 2019-11-29 15:33:25 +02:00
1ab8dc44b3 Merge pull request #1876 from huggingface/mean-fix
Mean does not exist in TF2
2019-11-29 09:26:33 +01:00
f0d22b6363 Merge pull request #1873 from stefan-it/distilbert-german
German DistilBERT
2019-11-29 09:25:47 +01:00
1e9ac5a7cf New -> normal 2019-11-28 17:43:47 -05:00
0b84b9fd8a Add processors to __init__ 2019-11-28 17:38:52 -05:00
f671997ef7 Interface with TFDS 2019-11-28 17:17:20 -05:00
bd41e8292a Cleanup & Evaluation now works 2019-11-28 16:03:56 -05:00
d49c43ff78 Merge pull request #1778 from eukaryote31/patch-2
from_pretrained: convert DialoGPT format
2019-11-28 16:08:37 +01:00
91caf2462c Merge pull request #1770 from huggingface/initi-encoder-mask
Only init encoder_attention_mask if stack is decoder
2019-11-28 16:06:55 +01:00
49a69d5b78 Merge pull request #1753 from digantamisra98/patch-1
Added Mish Activation Function
2019-11-28 15:24:08 +01:00
96e7ee7238 Merge pull request #1740 from huggingface/fix-ctrl-past
Fix CTRL past
2019-11-27 23:28:30 +01:00
8da47b078d fix merge tests 2019-11-27 23:11:37 +01:00
8c276b9c92 Merge branch 'master' into distilbert-german 2019-11-27 18:11:49 +01:00
3c28a2daac add add_special_tokens=True for input examples 2019-11-27 12:05:23 -05:00
a36f981d1b Merge branch 'master' into fix-ctrl-past 2019-11-27 17:25:46 +01:00
5afca00b47 Merge pull request #1724 from huggingface/fix_encode_plus
Fix encode_plus
2019-11-27 17:14:49 +01:00
49108288ba Merge pull request #1624 from Huawei-MRC-OSI/resumable_http
Add support for resumable downloads for HTTP protocol.
2019-11-27 17:11:07 +01:00
5340d1f21f Merge branch 'master' into resumable_http 2019-11-27 17:10:36 +01:00
10bd1ddb39 soft launch distilbert multilingual 2019-11-27 11:07:22 -05:00
d5478b939d add distilbert + update run_xnli wrt run_glue 2019-11-27 11:07:22 -05:00
07ab8d7af6 fix bug 2019-11-27 11:07:22 -05:00
d474022639 cleaning simple_accuracy since not used anymore 2019-11-27 11:07:22 -05:00
bcd8dc6b48 move xnli_compute_metrics to data/metrics 2019-11-27 11:07:22 -05:00
73fe2e7385 remove fstrings 2019-11-27 11:07:22 -05:00
3e7656f7ac update readme 2019-11-27 11:07:22 -05:00
abd397e954 uniformize w/ the cache_dir update 2019-11-27 11:07:22 -05:00
d75d49a51d add XnliProcessor to doc 2019-11-27 11:07:22 -05:00
d5910b312f move xnli processor (and utils) to transformers/data/processors 2019-11-27 11:07:22 -05:00
289cf4d2b7 change default for XNLI: dev --> test 2019-11-27 11:07:22 -05:00
cb7b77a8a2 fix some typos 2019-11-27 11:07:22 -05:00
84a0b522cf mbert reproducibility results 2019-11-27 11:07:22 -05:00
c4336ecbbd xnli - output_mode consistency 2019-11-27 11:07:22 -05:00
d52e98ff9a add xnli examples/README.md 2019-11-27 11:07:22 -05:00
71f71ddb3e run_xnli + utils_xnli 2019-11-27 11:07:22 -05:00
b5d884d25c Uniformize #1952 2019-11-27 11:05:55 -05:00
7fd1d42a01 Merge pull request #1592 from watkinsm/do_lower_case
Consider do_lower_case in PreTrainedTokenizer
2019-11-27 17:05:18 +01:00
21637d4924 Merge branch 'master' into do_lower_case 2019-11-27 17:04:39 +01:00
de2696f68e suggest to track repo w/ https rather than ssh 2019-11-27 11:02:28 -05:00
88b317739f Fix issue #1962: input's shape seems to cause an error in the 2.2.0 version of tf_albert_model 2019-11-27 10:38:10 -05:00
45d767297a Updated v2.2.0 doc 2019-11-27 10:12:20 -05:00
361620954a Remove TFBertForPreTraining from ALBERT doc 2019-11-27 10:11:37 -05:00
cc7968227e Updated v2.2.0 doc 2019-11-26 15:52:25 -05:00
ce02550d50 Fix pretrained models table 2019-11-26 15:47:02 -05:00
cf26a0c85e Fix pretrained models table 2019-11-26 15:40:03 -05:00
44b82c777f Updated v2.2.0 doc 2019-11-26 15:15:11 -05:00
ee4647bd5c CamemBERT & ALBERT doc 2019-11-26 15:10:51 -05:00
7c6000e412 Updated v2.2.0 doc 2019-11-26 14:55:29 -05:00
668aac45d2 Pretrained models 2019-11-26 14:52:42 -05:00
8742baa531 Improve test protocol for inputs_embeds in TF 2019-11-26 14:39:47 -05:00
cf62bdc962 Improve test protocol for inputs_embeds in TF
cc @lysandrejik
2019-11-26 14:37:32 -05:00
b632145273 Update master documentation link in README 2019-11-26 14:27:15 -05:00
ae98d45991 Release: v2.2.0 2019-11-26 14:12:44 -05:00
f2f329408d Fix input embeddings 2019-11-26 13:08:12 -05:00
bdfe21ab24 Change param order for consistency 2019-11-26 13:08:12 -05:00
c536c2a480 ALBERT Input Embeds 2019-11-26 13:08:12 -05:00
f873b55e43 Warning for ALBERT-v2 models 2019-11-26 13:08:12 -05:00
c9cb7f8a0f Torch 1.1.0 compatibility + FP16 O1 + TF checkpoints
Co-authored-by: wassname
2019-11-26 13:08:12 -05:00
b18509c208 Tests for ALBERT in TF2 + fixes 2019-11-26 13:08:12 -05:00
7bddbf5961 TFAlbertForSequenceClassification 2019-11-26 13:08:12 -05:00
f6f382532b ALBERT in TF2 2019-11-26 13:08:12 -05:00
d9daad98c7 Re-ordering of group_idx/layer_idx + Python 2 tests 2019-11-26 13:08:12 -05:00
9d5c49546f Tests for AlbertForQuestionAnswering AlbertForSequenceClassification 2019-11-26 13:08:12 -05:00
16263f9685 Headmasking 2019-11-26 13:08:12 -05:00
abb23a78ba Head pruning for ALBERT 2019-11-26 13:08:12 -05:00
4374eaea78 ALBERT for SQuAD 2019-11-26 13:08:12 -05:00
70d99980de ALBERT-V2 2019-11-26 13:08:12 -05:00
c110c41fdb Run GLUE and remove LAMB 2019-11-26 13:08:12 -05:00
6637a77f80 AlbertForSequenceClassification 2019-11-26 13:08:12 -05:00
0d07a23c04 LAMB implementation 2019-11-26 13:08:12 -05:00
c987545592 Converting script 2019-11-26 13:08:12 -05:00
4f3a54bfc8 ALBERT can load pre-trained models. Doesn't inherit from BERT anymore. 2019-11-26 13:08:12 -05:00
c4403006b8 External MLM head 2019-11-26 13:08:12 -05:00
b21402fc86 Python 2 tests + licence 2019-11-26 13:08:12 -05:00
c14a22272f ALBERT passes all tests 2019-11-26 13:08:12 -05:00
870320a24e Early tests 2019-11-26 13:08:12 -05:00
25a31953e8 Output Attentions + output hidden states 2019-11-26 13:08:12 -05:00
ce9eade29c Initializer range using BertPreTrainedModel 2019-11-26 13:08:12 -05:00
5680a11063 Activation function managed from the config file 2019-11-26 13:08:12 -05:00
1e5b31c388 Several fixes and improvements 2019-11-26 13:08:12 -05:00
ee20201d33 Tokenization tests + fixes + init 2019-11-26 13:08:12 -05:00
e3ea5d1d8d Docstrings 2019-11-26 13:08:12 -05:00
fedac786d4 Tokenization + small fixes 2019-11-26 13:08:12 -05:00
67b422662c Documentation + improved AlbertForMaskedLM 2019-11-26 13:08:12 -05:00
1b92564330 Reorganize and cleanup 2019-11-26 13:08:12 -05:00
12290c0d5c Handles multi layer and multi groups 2019-11-26 13:08:12 -05:00
139affaa8d Albert layer/layer groups 2019-11-26 13:08:12 -05:00
91ccbae788 Accepts multiple sizes 2019-11-26 13:08:12 -05:00
c0c2088333 ALBERT model 2019-11-26 13:08:12 -05:00
8e5d84fcc1 Fixed typo 2019-11-26 09:01:32 -05:00
0669c1fcd1 SQuAD v2 BERT + XLNet 2019-11-25 19:22:21 -05:00
5d3b8daad2 Minor bug fixes on run_ner.py 2019-11-25 16:48:03 -05:00
aa92a184d2 resize model when special tokenizer present 2019-11-25 15:06:32 -05:00
07bf43074f Fix GPT2 docstring 2019-11-25 11:32:00 -05:00
fa963ecc59 if→elif 2019-11-25 10:21:03 -05:00
c8eb8157b8 fix docstrings 2019-11-25 10:21:03 -05:00
99f750d64e add Camembert models to modeling_auto 2019-11-25 10:21:03 -05:00
7485caefb0 fix #1894 2019-11-25 09:33:39 -05:00
afaa335851 [doc] Fix assets urls 2019-11-23 11:34:45 -05:00
176cd1ce1b [doc] homogenize instructions slightly 2019-11-23 11:18:54 -05:00
041a901f32 Fix typo in documentation. toto -> to 2019-11-23 10:55:16 -05:00
e0e55bc550 Manage training example & refactor the refactor 2019-11-22 16:27:45 -05:00
c3ba645237 Works for XLNet 2019-11-22 16:27:37 -05:00
a5a8a6175f Works for BERT 2019-11-22 16:27:31 -05:00
a7dafe2f41 Padding strategy (left and right) rather than boolean flag 2019-11-22 16:27:25 -05:00
9f374c8252 encode and encode_plus handle attention masks and padding 2019-11-22 16:27:15 -05:00
72e506b22e wip 2019-11-22 16:26:00 -05:00
ea52f82455 Moved some SQuAD logic to /data 2019-11-22 16:25:52 -05:00
26db31e0c0 update the documentation 2019-11-21 14:41:19 -05:00
6f70bb8c69 add instructions to run the examples 2019-11-21 14:41:19 -05:00
05d4232f63 Add valohai.yaml 2019-11-21 12:38:17 +02:00
aac3551407 Add download_glue_data.py from kamalkraj/ALBERT-TF2.0
Original source: fa90194e5f/download_glue_data.py
Original license: fa90194e5f/LICENSE (Apache-2.0)
2019-11-21 12:37:41 +02:00
2cf3447e0a Glue: log in Valohai-compatible JSON format too 2019-11-21 12:35:25 +02:00
0cdfcca24b Merge pull request #1860 from stefan-it/camembert-for-token-classification
[WIP] Add support for CamembertForTokenClassification
2019-11-21 10:56:07 +01:00
e70cdf083d Cleanup TPU bits from run_glue.py
TPU runner is currently implemented in:
https://github.com/pytorch-tpu/transformers/blob/tpu/examples/run_glue_tpu.py.

We plan to upstream this directly into `huggingface/transformers`
(either `master` or `tpu`) branch once it's been more thoroughly tested.
2019-11-20 17:54:34 -05:00
454455c695 fix #1879 2019-11-20 09:42:48 -05:00
3de31f8d28 mean does not exist in TF2 2019-11-19 18:14:14 -05:00
da06afafc8 tree-wide: add trailing comma in configuration maps 2019-11-19 21:57:00 +01:00
2e2c0375c3 distilbert: add German distilbert model to positional embedding sizes map 2019-11-19 20:41:18 +01:00
e7cf2ccd15 distillation: add German distilbert model 2019-11-19 19:55:19 +01:00
e631383d4f docs: add new German distilbert model to pretrained models 2019-11-19 19:52:40 +01:00
f21dfe36ba distilbert: add vocab for new German distilbert model 2019-11-19 19:51:31 +01:00
22333945fb distilbert: add pytorch model for new German distilbert model 2019-11-19 19:51:01 +01:00
337802783f distilbert: add configuration for new German distilbert model 2019-11-19 19:50:32 +01:00
4193aa9f81 add TFXLNetForTokenClassification implementation and unit test
add XLNetForTokenClassification implementation and unit tests
2019-11-19 12:47:54 +01:00
f3386d9383 typo "deay" -> "decay" 2019-11-18 11:50:06 -05:00
56c84863a1 camembert: add support for CamemBERT in run_ner example 2019-11-18 17:06:57 +01:00
0b3d45eb64 camembert: add implementation for save_vocabulary method 2019-11-18 15:49:44 +01:00
3916b334a8 [camembert] Acknowledge the full author list 2019-11-18 09:29:11 -05:00
44455eb5b6 Adds CamemBERT to Model architectures list 2019-11-18 09:23:14 -05:00
33753d9139 module: import CamembertForTokenClassification 2019-11-18 14:14:54 +01:00
d32ce2c8df camembert: add wrapper for CamembertForTokenClassification 2019-11-18 14:14:19 +01:00
d08a338c3b modified: transformers/modeling_utils.py 2019-11-16 18:47:37 +09:00
0477b307c7 [camembert] tokenizer: use additional_special_tokens 2019-11-16 00:11:07 -05:00
f9abf73e31 [camembert] realign w/ recent changes 2019-11-16 00:11:07 -05:00
26858f27cb [camembert] Upload to s3 + rename script 2019-11-16 00:11:07 -05:00
035fea5315 Add CamemBERT to auto files and docs 2019-11-16 00:11:07 -05:00
694d4fcbb6 Add CamemBERT classes to __init__.py 2019-11-16 00:11:07 -05:00
3e20c2e871 Update demo_camembert.py with new classes 2019-11-16 00:11:07 -05:00
f12e4d8da7 Move demo_camembert.py to examples/contrib 2019-11-16 00:11:07 -05:00
fb6c70a91d Update tokenization_camembert.py with urls 2019-11-16 00:11:07 -05:00
e44b939e71 Add configuration_camembert.py and modeling_camembert.py 2019-11-16 00:11:07 -05:00
6e72fd094c Add demo_camembert.py 2019-11-16 00:11:07 -05:00
14b3aa3b3c Add tokenization_camembert.py 2019-11-16 00:11:07 -05:00
ca99a2d500 Update example readme 2019-11-15 14:55:26 +08:00
7da3ef24cd add is_impossible tensor to model inputs during fine-tuning xlnet on squad2.0 2019-11-15 14:18:53 +08:00
74ce8de7d8 Merge pull request #1792 from stefan-it/distilbert-for-token-classification
DistilBERT for token classification
2019-11-14 22:47:53 +01:00
05db5bc1af added small comparison between BERT, RoBERTa and DistilBERT 2019-11-14 22:40:22 +01:00
9629e2c676 Merge pull request #1804 from ronakice/master
fix multi-gpu eval in torch examples
2019-11-14 22:24:05 +01:00
5b322a36db Merge pull request #1811 from huggingface/special-tokens
Fix special tokens addition in decoder #1807
2019-11-14 22:17:24 +01:00
1a237d7f42 Merge pull request #1831 from iedmrc/gpt2-tokenization-sum-func-replacement
sum() is replaced by itertools.chain.from_iterable()
2019-11-14 22:11:54 +01:00
df99f8c5a1 Merge pull request #1832 from huggingface/memory-leak-schedulers
replace LambdaLR scheduler wrappers by function
2019-11-14 22:10:31 +01:00
0be9ae7b3e Merge pull request #1833 from huggingface/max-length-warning
Token indices sequence length is longer than the specified maximum sequence length for this model
2019-11-14 22:04:49 +01:00
be7f2aacce [CI][DOC] Don't rebuild if folder exists - Correct directory. 2019-11-14 14:54:44 -05:00
8f8d69716a [CI][DOC] Don't rebuild if folder exists. 2019-11-14 14:48:21 -05:00
2276bf69b7 update the examples, docs and template 2019-11-14 20:38:02 +01:00
d7929899da Specify checkpoint in saved file for run_lm_finetuning.py 2019-11-14 10:49:00 -05:00
a67e747889 Reorganized max_len warning 2019-11-14 10:30:22 -05:00
e18f786cd5 Quickstart example showcasing past 2019-11-14 10:06:00 -05:00
022525b003 replace LambdaLR scheduler wrappers by function
Custom schedulers are currently initiated by wrapping Pytorch's LambdaLR
class and passing a method of the wrapping class to the __init__
function of LambdaLR. This approach is not appropriate for several
reasons:

1. one does not need to define a class when it only defines a
__init__() method;
2. instantiating the parent class by passing a method of the child class
creates a cyclical reference which leads to memory leaks. See issues #1742 and #1134.

In this commit we replace the wrapper classes with functions that
instantiate `LambdaLR` with a custom learning rate function. We use a
closure to specify the parameter of the latter. We also do a bit of
renaming within the function to make the behaviour explicit, and removed
docstrings that were no longer necessary.
2019-11-14 15:39:08 +01:00
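A minimal sketch of the pattern described above, using a plain function and a closure around LambdaLR (the schedule shown is a generic linear warmup/decay for illustration, not necessarily the exact one in the library):

```python
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR

def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    # A plain function builds the schedule; the closure captures the parameters,
    # so no wrapper class and no cyclical reference between scheduler and object.
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(0.0, float(num_training_steps - current_step)
                   / float(max(1, num_training_steps - num_warmup_steps)))
    return LambdaLR(optimizer, lr_lambda)

optimizer = SGD([torch.zeros(1, requires_grad=True)], lr=0.1)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=100)
```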
7627dde1f8 sum() is the leanest method to flatten a string list, so it's been replaced by itertools.chain.from_iterable() 2019-11-14 17:06:15 +03:00
74d0bcb6ff Fix special tokens addition in decoder 2019-11-12 15:27:57 -05:00
155c782a2c [inputs_embeds] All TF models + tests 2019-11-12 11:29:21 -05:00
2aef2f0bbc [common attributes] Fix previous commit for transfo-xl 2019-11-12 11:29:21 -05:00
2f17464266 [common attributes] Slightly sharper test coverage 2019-11-12 11:29:21 -05:00
9d2398fd99 Ooopsie 2019-11-12 11:29:21 -05:00
70d97ddd60 [TF models] Common attributes as per #1721 2019-11-12 11:29:21 -05:00
872403be1c This is not a @property after all 2019-11-12 11:29:21 -05:00
dd6b2e05e1 whitespace 2019-11-12 11:29:21 -05:00
d409aca326 Clarify the use of past in GPT2 and CTRL 2019-11-12 10:59:37 -05:00
7246d3c2f9 Consider do_lower_case in PreTrainedTokenizer
As pointed out in #1545, when using an uncased model, and adding
a new uncased token, the tokenizer does not correctly identify this
in the case that the input text contains the token in a cased format.

For instance, if we load bert-base-uncased into BertTokenizer, and
then use .add_tokens() to add "cool-token", we get the expected
result for .tokenize('this is a cool-token'). However, we get a
possibly unexpected result for .tokenize('this is a cOOl-Token'),
which in fact mirrors the result for the former from before the new
token was added.

This commit adds
- functionality to PreTrainedTokenizer to handle this
situation in case a tokenizer (currently Bert, DistilBert,
and XLNet) has the do_lower_case=True kwarg by:
    1) lowercasing tokens added with .add_tokens()
    2) lowercasing text at the beginning of .tokenize()
- new common test case for tokenizers

https://github.com/huggingface/transformers/issues/1545
2019-11-12 13:08:30 +02:00
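A toy sketch of the behaviour described above (class and method names are illustrative, not the actual PreTrainedTokenizer code): with do_lower_case=True, both the added tokens and the incoming text are lowercased, so cased variants of an added token are still recognized.

```python
class LowercasingTokenizerSketch:
    def __init__(self, do_lower_case=True):
        self.do_lower_case = do_lower_case
        self.added_tokens = set()

    def add_tokens(self, tokens):
        for token in tokens:
            self.added_tokens.add(token.lower() if self.do_lower_case else token)

    def tokenize(self, text):
        if self.do_lower_case:
            text = text.lower()
        # naive whitespace split, just to show that the cased variant now matches
        return text.split()

tok = LowercasingTokenizerSketch()
tok.add_tokens(["cool-token"])
print(tok.tokenize("this is a cOOl-Token"))  # ['this', 'is', 'a', 'cool-token']
```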
2e31176557 fix multi-gpu eval 2019-11-12 05:55:11 -05:00
8aba81a0b6 fix #1789 2019-11-12 08:52:43 +01:00
94e55253ae tests: add test case for DistilBertForTokenClassification implementation 2019-11-11 16:20:15 +01:00
2b07b9e5ee examples: add DistilBert support for NER fine-tuning 2019-11-11 16:19:34 +01:00
1806eabf59 module: add DistilBertForTokenClassification import 2019-11-11 16:18:48 +01:00
1c7253cc5f modeling: add DistilBertForTokenClassification implementation 2019-11-11 16:18:16 +01:00
b5d330d118 Fix #1784 2019-11-11 10:15:14 -05:00
90f6e73a35 Add DialoGPT support for Pytorch->TF 2019-11-09 16:46:19 +00:00
ef99852961 from_pretrained: convert DialoGPT format
DialoGPT checkpoints have "lm_head.decoder.weight" instead of "lm_head.weight". 

(see: https://www.reddit.com/r/MachineLearning/comments/dt5woy/p_dialogpt_state_of_the_art_conversational_model/f6vmwuy?utm_source=share&utm_medium=web2x)
2019-11-09 16:32:40 +00:00
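A hedged sketch of the key remapping this conversion implies (the function name is illustrative):

```python
import torch

def convert_dialogpt_state_dict(path):
    # DialoGPT checkpoints name the output projection "lm_head.decoder.weight";
    # rename it to "lm_head.weight" so a GPT-2-style model finds the weight it expects.
    state_dict = torch.load(path, map_location="cpu")
    if "lm_head.decoder.weight" in state_dict:
        state_dict["lm_head.weight"] = state_dict.pop("lm_head.decoder.weight")
    return state_dict
```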
7a9aae1044 Fix run_bertology.py
Make imports and args.overwrite_cache match run_glue.py
2019-11-08 16:28:40 -05:00
268d4f2099 fix position biases + better tests 2019-11-08 16:41:55 +01:00
b4fcd59a5a add sentinels in tokenizer 2019-11-08 14:38:53 +01:00
15e53c4e87 maybe fix tests 2019-11-08 12:43:21 +01:00
f03c0c1423 adding models in readme and auto classes 2019-11-08 11:49:46 +01:00
4321c54125 fix tests 2019-11-08 11:49:32 +01:00
727a79b305 added TF2 model and tests - updated templates 2019-11-08 11:35:03 +01:00
cd286c2145 add condition around mask transformation 2019-11-08 11:31:16 +01:00
28d0ba35d7 only init encoder_attention_mask if stack is decoder
We currently initialize `encoder_attention_mask` when it is `None`,
whether the stack is that of an encoder or a decoder. Since this
may lead to bugs that are difficult to track down, I added a condition
that assesses whether the current stack is a decoder.
2019-11-08 11:22:19 +01:00
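A minimal sketch of the added guard (names are illustrative): the default mask is only fabricated when the stack is actually a decoder and cross-attention will use it.

```python
import torch

def maybe_default_encoder_attention_mask(is_decoder, encoder_hidden_states, encoder_attention_mask):
    if is_decoder and encoder_hidden_states is not None and encoder_attention_mask is None:
        batch_size, seq_length = encoder_hidden_states.shape[:2]
        encoder_attention_mask = torch.ones(
            batch_size, seq_length, device=encoder_hidden_states.device
        )
    return encoder_attention_mask
```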
8fda532c3c fix python 2 sentencepiece tokenization 2019-11-07 17:09:50 +01:00
ba10065c4b update model, conversion script, tests and template 2019-11-07 15:55:36 +01:00
070dcf1c02 Added Mish Activation Function
Mish is a new activation function proposed here - https://arxiv.org/abs/1908.08681
It has seen some recent success and has been adopted in SpaCy, Thinc, TensorFlow Addons and FastAI-dev. 
All benchmarks recorded till now (including against ReLU, Swish and GELU) are present in the repository - https://github.com/digantamisra98/Mish
Might be a good addition to experiment with especially in the Bert Model.
2019-11-07 03:45:43 +05:30
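For reference, Mish is x * tanh(softplus(x)); a minimal PyTorch sketch:

```python
import torch
import torch.nn.functional as F

def mish(x):
    # Mish activation as proposed in arXiv:1908.08681
    return x * torch.tanh(F.softplus(x))

print(mish(torch.tensor([-1.0, 0.0, 1.0])))
```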
1c542df7e5 Add RoBERTa-based GPT-2 Output Detector from OpenAI
converted from https://github.com/openai/gpt-2-output-dataset/tree/master/detector

Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
Co-Authored-By: Jong Wook Kim <jongwook@nyu.edu>
Co-Authored-By: Jeff Wu <wuthefwasthat@gmail.com>
2019-11-06 16:26:31 -05:00
2f3a421018 Fix other PyTorch models 2019-11-06 14:03:47 -05:00
d5319793c4 Fix BERT 2019-11-06 14:03:47 -05:00
27e015bd54 [tests] Flag to test on cuda 2019-11-06 14:03:47 -05:00
13d9135fa5 [tests] get rid of warning
cf. https://docs.pytest.org/en/latest/example/simple.html
2019-11-06 14:03:47 -05:00
076a207935 adding tests and updating model 2019-11-06 11:52:50 +01:00
73f2c342f5 fixing template 2019-11-06 11:52:39 +01:00
3835e1e651 adding tokenizer 2019-11-06 11:52:29 +01:00
f88c104d8f [run_tf_glue] Add comment for context 2019-11-05 19:56:43 -05:00
30968d70af misc doc 2019-11-05 19:06:12 -05:00
de890ae67d Updating docblocks in optimizers.py 2019-11-05 17:31:29 -05:00
d7d36181fd GPT-2 XL 2019-11-05 13:31:58 -05:00
151e4ab4e7 Fix CTRL past 2019-11-05 16:26:51 +00:00
88e5bef58f share position biases 2019-11-05 17:02:52 +01:00
568c0ffb7e adding T5 model 2019-11-05 16:40:29 +01:00
7daacf00df Merge pull request #1695 from huggingface/models_inputs_embeds
model forwards can take an inputs_embeds param
2019-11-05 09:55:28 -05:00
a44f112fb9 add authors for models 2019-11-05 08:48:26 -05:00
60a5babd57 adding files 2019-11-05 12:01:23 +01:00
124409d075 Make dummy inputs a property of TFPreTrainedModel. 2019-11-05 11:48:45 +01:00
e99071f105 Merge pull request #1734 from orena1/patch-1
add progress bar to convert_examples_to_features
2019-11-05 11:34:20 +01:00
dfb61caf77 fix #1692 2019-11-05 11:25:13 +01:00
ba973342e3 Merge pull request #1553 from WilliamTambellini/timeSquadInference
Add speed log to examples/run_squad.py
2019-11-05 11:13:12 +01:00
8df7dfd2a7 Make dummy inputs a local variable in TFPreTrainedModel. 2019-11-05 11:09:16 +01:00
237fad339c Merge pull request #1709 from oneraghavan/master
Fixing mode in evaluate during training
2019-11-05 10:55:33 +01:00
f1e4db2aa8 Fix #1686 2019-11-05 09:38:00 +01:00
d7906165a3 add progress bar for convert_examples_to_features
It takes a considerable amount of time (~10 min) to parse the examples into features, so it is good to have a progress bar to track this
2019-11-05 10:34:27 +02:00
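A minimal sketch of the change, assuming tqdm (the featurize callable is a stand-in for the real conversion logic):

```python
from tqdm import tqdm

def convert_examples_to_features(examples, featurize):
    # Wrapping the conversion loop in tqdm shows progress for a step that can take minutes.
    return [featurize(example) for example in tqdm(examples, desc="convert examples to features")]

features = convert_examples_to_features(range(1000), featurize=lambda x: x * 2)
```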
d2e2577dd3 Merge pull request #1723 from huggingface/fix-1623
Fix #1623
2019-11-05 08:36:30 +01:00
00337e9687 [inputs_embeds] All PyTorch models 2019-11-05 00:39:18 +00:00
9eddf44b7a docstring + check 2019-11-04 17:19:15 +00:00
8e11de0e86 model forwards can take an inputs_embeds param 2019-11-04 16:56:26 +00:00
68f7064a3e Add model.train() line to ReadMe training example
Co-Authored-By: Santosh-Gupta <San.Gupta.ML@gmail.com>
2019-11-04 11:52:35 -05:00
8d6b9d717c fix #1532 and encode_plus 2019-11-04 17:07:51 +01:00
c8f2712199 Merge pull request #1721 from huggingface/common_attributes
Add common getter and setter for input_embeddings & output_embeddings
2019-11-04 16:21:52 +01:00
89d6272898 Fix #1623 2019-11-04 16:21:12 +01:00
b340a910ed fix tests - flagged as slow all the tests downloading from AWS 2019-11-04 16:03:36 +01:00
f02805da6f fix tests 2019-11-04 15:42:23 +01:00
1d4d070256 Merge pull request #1549 from hlums/master
Fix token order in xlnet preprocessing for SQuAD
2019-11-04 15:37:15 +01:00
1724cee8c4 switch from properties to methods 2019-11-04 15:34:10 +01:00
9b45d0f878 Add common properties input_embeddings and output_embeddings 2019-11-04 12:28:56 +01:00
9a3b173cd3 Merge branch 'master' into master 2019-11-04 11:41:26 +01:00
ad90868627 Update example readme 2019-11-04 11:27:22 +01:00
e5b1048bae Fixing mode in evaluate during training 2019-11-03 16:14:46 +05:30
8a62835577 Merge pull request #1679 from cregouby/master
Fix https://github.com/huggingface/transformers/issues/1673
2019-11-01 22:02:24 +01:00
93d2fff071 Close #1654 2019-11-01 09:47:38 -04:00
1a2b40cb53 run_tf_glue MRPC evaluation only for MRPC 2019-10-31 18:00:51 -04:00
be36cf92fb Added mixed precision support to benchmarks.py 2019-10-31 17:24:37 -04:00
2a5663c280 Merge branch 'mataney-fix_top_k_top_p_filtering' 2019-10-31 18:28:34 +00:00
f96ce1c241 [run_generation] Fix generation with batch_size>1 2019-10-31 18:27:11 +00:00
3c1b6f594e Merge branch 'master' into fix_top_k_top_p_filtering 2019-10-31 13:53:51 -04:00
0e4cc050d6 Add support for resumable downloads for HTTP protocol. 2019-10-31 18:25:34 +03:00
ac29353abe Fix https://github.com/huggingface/transformers/issues/1673 2019-10-31 10:04:40 +01:00
fa735208c9 update readme - fix example command distil* 2019-10-30 14:27:28 -04:00
c7058d8224 Merge pull request #1608 from focox/master
Error raised by "tmp_eval_loss += tmp_eval_loss.item()" when using multi-gpu
2019-10-30 17:14:07 +01:00
22838f19fd Merge pull request #1668 from tlkh/fix-tf-xlm
Fixed training for TF XLM
2019-10-30 17:08:00 +01:00
7f84fc571a Merge pull request #1670 from huggingface/templates
Templates and explanation for adding a new model and example script
2019-10-30 17:05:58 +01:00
04c69db399 Merge pull request #1628 from huggingface/tfglue
run_tf_glue works with all tasks
2019-10-30 17:04:03 +01:00
5c6a19a94a Merge pull request #1604 from huggingface/deploy_doc
Versioning in documentation
2019-10-30 17:03:14 +01:00
3df4367244 Merge pull request #1601 from huggingface/clean-roberta
Clean roberta model & all tokenizers now add special tokens by default (breaking change)
2019-10-30 17:00:40 +01:00
6d73c92cae Merge pull request #1455 from huggingface/conditional-generation
[WIP] Sequence generation using pretrained BERT
2019-10-30 16:54:18 +01:00
36174696cc Merge branch 'master' into clean-roberta 2019-10-30 16:51:06 +01:00
228cdd6a6e Merge branch 'master' into conditional-generation 2019-10-30 16:40:35 +01:00
3cf2020c6b change kwargs processing 2019-10-30 16:27:51 +01:00
a88a0e4413 add tests to encoder-decoder model 2019-10-30 16:06:29 +01:00
3f07cd419c update test on Bert to include decoder mode 2019-10-30 15:09:53 +01:00
55fbfea369 Update CONTRIBUTING.md
Co-Authored-By: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
2019-10-30 12:25:40 +01:00
cef2a8f900 Update CONTRIBUTING.md
Co-Authored-By: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
2019-10-30 12:25:31 +01:00
328a86d2af adding links to the templates in readme and contributing 2019-10-30 11:37:55 +01:00
7f4226f9e6 adding templates 2019-10-30 11:31:56 +01:00
070507df1f format utils for summarization 2019-10-30 11:24:12 +01:00
da10de8466 fix bug with padding mask + add corresponding test 2019-10-30 11:19:58 +01:00
3b0d2fa30e rename seq2seq to encoder_decoder 2019-10-30 10:54:46 +01:00
9c1bdb5b61 revert renaming of lm_labels to ltr_lm_labels 2019-10-30 10:43:13 +01:00
842f3bf049 Fixed training for TF XLM 2019-10-30 01:32:15 +00:00
098a89f312 update docstrings; rename lm_labels to more explicit ltr_lm_labels 2019-10-29 20:08:03 +01:00
dfce409691 resolve PR comments 2019-10-29 17:10:20 +01:00
079bfb32fb Evaluation fixed. 2019-10-28 10:18:58 -04:00
438f2730a0 Evaluation code fixed. 2019-10-28 10:18:58 -04:00
4c3ac4a7d8 here's one big commit 2019-10-28 10:49:50 +01:00
932543f77e fix test of truncation function 2019-10-28 10:49:49 +01:00
a67413ccc8 extend works in-place 2019-10-28 10:49:49 +01:00
cb26b035c6 remove potential UndefinedError 2019-10-28 10:49:49 +01:00
b915ba9dfe pad sequence with 0, mask with -1 2019-10-28 10:49:49 +01:00
dc580dd4c7 add lm_labels for the LM cross-entropy 2019-10-28 10:49:49 +01:00
f873a3edb2 the decoder attends to the output of the encoder stack (last layer) 2019-10-28 10:49:00 +01:00
d36680df54 Revert changes to TF distilbert due to failed test: TFDistilBertModelTest.test_pt_tf_model_equivalence 2019-10-27 14:51:36 +08:00
ec276d6aba Add special tokens to documentation for the tensorflow model examples #1561 2019-10-27 14:00:40 +08:00
6e011690a9 Add special tokens to documentation for the rest of pytorch model examples #1561 2019-10-27 13:59:14 +08:00
beaf66b1f3 Remove break 2019-10-24 21:43:28 +00:00
bab6ad01aa run_tf_glue works with all tasks 2019-10-24 21:41:45 +00:00
ae1d03fc51 Add roberta to doc 2019-10-24 14:32:48 -04:00
4e5f88b74f Add Roberta to run_ner.py 2019-10-24 14:32:48 -04:00
b92d68421d Use roberta model and update doc strings 2019-10-24 14:32:48 -04:00
66085a1321 RoBERTa token classification
[WIP] copy paste bert token classification for roberta
2019-10-24 14:32:48 -04:00
b82bfbd0c3 Updated README to show all available documentation 2019-10-24 15:55:31 +00:00
5b6cafb11b [release] fix table weirdness 2019-10-23 10:35:16 -04:00
8ad5c591cd [RELEASE] DistilRoBERTa 2019-10-23 10:29:47 -04:00
bd847ce7d7 fixed the bug raised by "tmp_eval_loss += tmp_eval_loss.item()" when parallelly using multi-gpu. 2019-10-23 20:27:13 +08:00
6e85bccafc Fixed typo 2019-10-22 18:07:01 -04:00
fbcc5ff9fb Change branch to master 2019-10-22 18:01:10 -04:00
69eba0ab19 Edit script path 2019-10-22 17:53:52 -04:00
bc3e57d551 Multi version doc deployment 2019-10-22 17:51:30 -04:00
ef1b8b2ae5 [CTRL] warn if generation prompt does not start with a control code
see also https://github.com/salesforce/ctrl/pull/50
2019-10-22 21:30:32 +00:00
e16d46843a Fix architectures count 2019-10-22 15:13:47 -04:00
7d709e55ed Remove 2019-10-22 14:12:33 -04:00
44286b94d3 RoBERTa doesn't print a warning when no special tokens are passed. 2019-10-22 13:46:48 -04:00
1cfd974868 Option to benchmark only one of the two libraries 2019-10-22 13:32:23 -04:00
777faa8ae7 Fix #1597 2019-10-22 11:26:42 -04:00
b8c9ea0010 Merge pull request #1580 from pminervini/master
Gradient norm clipping should be done right before calling the optimiser
2019-10-22 13:59:20 +02:00
abd7110e21 gradient norm clipping should be done right before calling the optimiser - fixing run_glue and run_ner as well 2019-10-21 19:56:52 +01:00
4d456542e9 Fix citation 2019-10-21 16:34:14 +02:00
0e64fec1ab Merge pull request #1568 from daemon/patch-1
Fix hanging when loading pretrained models
2019-10-21 14:31:57 +02:00
3a52b65795 Add special tokens to documentation for bert examples to resolve issue: #1561 2019-10-21 12:55:51 +08:00
86a630702d Merge branch 'huggingface/master' 2019-10-21 12:06:09 +08:00
3775550c4b gradient norm clipping should be done right before calling the optimiser 2019-10-20 22:33:56 +01:00
bf2c36a920 Merge pull request #1 from huggingface/master
update
2019-10-20 23:30:45 +02:00
a2c8c8ef00 Fix hanging when loading pretrained models
- Fix hanging when loading pretrained models from the cache without having internet access. This is a widespread issue on supercomputers whose internal compute nodes are firewalled.
2019-10-19 16:19:20 -04:00
82f6abd98a Benchmark section added to the documentation 2019-10-18 17:27:10 -04:00
7dd29ed2f1 Benchmarks example script 2019-10-18 10:53:04 -04:00
8efc0ec91a Add Benchmarks to issue templates 2019-10-18 10:45:44 -04:00
0919389d9a Add speed log to examples/run_squad.py
Add a speed estimate log (time per example)
for evaluation to examples/run_squad.py
2019-10-17 14:41:04 -07:00
fd97761c5a soft launch distilroberta 2019-10-17 15:28:58 -04:00
ecd15667f3 fix repetition penalty 2019-10-17 14:47:14 -04:00
56e2ee4ead fix model2model 2019-10-17 16:33:31 +02:00
8cd56e3036 fix data processing in script 2019-10-17 16:33:26 +02:00
578d23e061 add training pipeline (formatting temporary) 2019-10-17 14:02:27 +02:00
47a06d88a0 use two different tokenizers for story and summary 2019-10-17 13:04:26 +02:00
bfb9b540d4 add Model2Model to __init__ 2019-10-17 12:59:51 +02:00
c1bc709c35 correct the truncation and padding of dataset 2019-10-17 10:41:53 +02:00
87d60b6e19 reword explanation of encoder_attention_mask 2019-10-17 10:18:19 +02:00
638fe7f5a4 correct composition of padding and causal masks 2019-10-17 10:13:07 +02:00
4e0f24348f document the MLM modification + raise exception on MLM training with encoder-decoder 2019-10-17 09:41:53 +02:00
624a5644cc revert black formatting to conform with lib style 2019-10-17 09:27:56 +02:00
9b71fc9a18 tying weights is going to be a clusterfuck 2019-10-16 21:31:38 +02:00
95ec1d08be separate inputs into encoder & decoder inputs 2019-10-16 20:55:42 +02:00
e4e0ee14bd add separator between data import and train 2019-10-16 20:05:32 +02:00
a424892fab correct syntax error: dim() and not dims() 2019-10-16 18:24:32 +02:00
33c01368b1 remove Bert2Rnd test 2019-10-16 18:13:05 +02:00
c544194611 Remove special_tokens_mask from inputs in README
Co-authored-by: Thomas Wolf @thomwolf
2019-10-16 11:05:13 -04:00
0752069617 adapt attention masks for the decoder case
The introduction of a decoder introduces 2 changes:
- We need to be able to specify a separate mask in the cross
attention to mask the positions corresponding to padding tokens in the
encoder state.
- The self-attention in the decoder needs to be causal on top of not
attending to padding tokens.
2019-10-16 16:12:22 +02:00
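A small PyTorch sketch of combining the two constraints described above for the decoder self-attention (an illustrative helper, not the library code): each position may attend only to earlier or equal positions that are also not padding.

```python
import torch

def combine_padding_and_causal_mask(padding_mask):
    # padding_mask: (batch_size, seq_length) with 1 for real tokens, 0 for padding
    batch_size, seq_length = padding_mask.shape
    causal = torch.tril(torch.ones(seq_length, seq_length, dtype=padding_mask.dtype))
    # broadcast: (batch, 1, seq) * (seq, seq) -> (batch, seq, seq)
    return padding_mask[:, None, :] * causal

mask = combine_padding_and_causal_mask(torch.tensor([[1, 1, 1, 0]]))
print(mask[0])
```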
c5a94a6100 fix function that defines masks in XLM
the definition of `get_masks` would blow up with the proper combination of
arguments. It was just a matter of moving a definition outside of a
control structure.
2019-10-16 13:00:32 +02:00
488a664151 add is_decoder attribute to PretrainedConfig
We currently instantiate encoders and decoders for the seq2seq by
passing the `is_decoder` keyword argument to the `from_pretrained`
classmethod. On the other hand, the model class looks for the value
of the `is_decoder` attribute in its config.

In order for the value to propagate from the kwarg to the configuration
we simply need to define `is_decoder` as an attribute to the base
`PretrainedConfig`, with a default of `False`.
2019-10-15 21:03:32 +02:00
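A toy sketch of the propagation described above (the class name is illustrative): `is_decoder` lives on the base config with a default of False, so it can simply be forwarded as a kwarg and the model reads it from its config.

```python
class PretrainedConfigSketch:
    def __init__(self, **kwargs):
        # Unknown kwargs from from_pretrained(...) land here; is_decoder defaults to False.
        self.is_decoder = kwargs.pop("is_decoder", False)

encoder_config = PretrainedConfigSketch()
decoder_config = PretrainedConfigSketch(is_decoder=True)
print(encoder_config.is_decoder, decoder_config.is_decoder)  # False True
```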
4c81960b9b comment the seq2seq functions 2019-10-15 20:52:28 +02:00
6d6c326737 take path to pretrained for encoder and decoder for init 2019-10-15 16:08:27 +02:00
0d81fc853e specify in readme that both datasets are required 2019-10-15 15:26:33 +02:00
19e9964780 remove Bert2Bert from module declaration 2019-10-15 15:20:28 +02:00
1aec940587 test the full story processing 2019-10-15 15:18:07 +02:00
22e1af6859 truncation function is fully tested 2019-10-15 14:43:50 +02:00
260ac7d9a8 wip commit, switching computers 2019-10-15 12:24:35 +02:00
be916cb3fb Merge branch 'master' of https://github.com/huggingface/transformers 2019-10-15 10:37:13 +02:00
5875aaf762 install tensorboard 2019-10-15 10:36:46 +02:00
40f14ff545 Merge pull request #1513 from slayton58/amp_fp16_einsum
Force einsum to run in fp16
2019-10-15 10:25:00 +02:00
e703e4dfe1 Merge pull request #1509 from julian-pani/patch-3
remove leftover usage of DUMMY_INPUTS
2019-10-15 10:24:13 +02:00
898ce064f8 add tests on TF2.0 & PT checkpoint => model conversion functions 2019-10-15 10:04:19 +02:00
d147671c6c Merge pull request #1508 from tlkh/master
Added performance enhancements (XLA, AMP) to examples
2019-10-15 09:57:18 +02:00
2c1d5564ad add readme information 2019-10-15 09:56:52 +02:00
08bd8f9f39 Merge pull request #1505 from e-budur/master
Fixed the sample code in the title 'Quick tour'.
2019-10-15 09:50:36 +02:00
8aa3b753bd Merge pull request #1434 from bryant1410/patch-1
Remove unnecessary use of FusedLayerNorm in XLNet
2019-10-15 09:44:19 +02:00
621e7a2529 Merge pull request #1275 from stecklin/ner-fine-tuning
Implement fine-tuning BERT on CoNLL-2003 named entity recognition task
2019-10-15 09:35:24 +02:00
c55badcee0 Add NER finetuning details by @stefan-it in example readme 2019-10-15 09:33:52 +02:00
788e632622 [ner] Honor args.overwrite_cache 2019-10-15 09:17:31 +02:00
0f9ebb0b43 add seqeval as requirement for examples 2019-10-15 09:17:31 +02:00
66adb71734 update to transformers 2019-10-15 09:17:31 +02:00
5ff9cd158a Add option to predict on test set 2019-10-15 09:17:31 +02:00
7f5367e0b1 Add cli argument for configuring labels 2019-10-15 09:17:31 +02:00
e1d4179b64 Make file reading more robust 2019-10-15 09:17:31 +02:00
383ef96747 Implement fine-tuning BERT on CoNLL-2003 named entity recognition task 2019-10-15 09:17:31 +02:00
5adb39e757 Add option to predict on test set 2019-10-15 09:14:53 +02:00
99b189df6d Add cli argument for configuring labels 2019-10-15 09:14:53 +02:00
3e9420add1 Make file reading more robust 2019-10-15 09:14:53 +02:00
cde42c4354 Implement fine-tuning BERT on CoNLL-2003 named entity recognition task 2019-10-15 09:14:53 +02:00
74c5035808 Fix token order in xlnet preprocessing. 2019-10-14 21:27:11 +00:00
fe25eefc15 add instructions to fetch the dataset 2019-10-14 20:45:39 +02:00
412793275d delegate the padding with special tokens to the tokenizer 2019-10-14 20:45:16 +02:00
447fffb21f process the raw CNN/Daily Mail dataset
the data provided by Li Dong et al. were already tokenized, which means
that they are not compatible with all the models in the library. We
thus process the raw data directly and tokenize them using the models'
tokenizers.
2019-10-14 18:12:20 +02:00
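A hedged sketch of that approach, assuming the plain-text story and summary strings are already loaded (file handling omitted); the helper and field names below are illustrative, not the script's actual code.

```python
# Sketch only: encode raw story/summary text with the model's own tokenizer
# rather than relying on a pre-tokenized release of the dataset.
from transformers import BertTokenizer

def encode_example(story_text: str, summary_text: str, tokenizer, max_length: int = 512):
    # Each side is tokenized independently with the model's tokenizer and
    # truncated to max_length token ids.
    story_ids = tokenizer.encode(story_text, add_special_tokens=True)[:max_length]
    summary_ids = tokenizer.encode(summary_text, add_special_tokens=True)[:max_length]
    return story_ids, summary_ids

tok = BertTokenizer.from_pretrained("bert-base-uncased")
story, summary = "A very long news article ...", "A short abstract."
print(encode_example(story, summary, tok))
```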
80889a0226 Merge pull request #1512 from louismartin/fix-roberta-convert
Fix import error in script to convert fairseq roberta checkpoints
2019-10-14 17:40:32 +02:00
4e6a55751a Force einsum to fp16 2019-10-14 11:12:41 -04:00
f62f992cf7 Merge pull request #1502 from jeffxtang/master
the working example code to use BertForQuestionAnswering
2019-10-14 16:14:52 +02:00
67d10960ae load and prepare CNN/Daily Mail data
We write a function to load and preprocess the CNN/Daily Mail dataset as
provided by Li Dong et al. The issue is that this dataset has already
been tokenized by the authors, so we actually need to find the original,
plain-text dataset if we want to apply it to all models.
2019-10-14 14:11:20 +02:00
d9d387afce clean up 2019-10-14 12:14:40 +02:00
b7141a1bc6 maxi simplification 2019-10-14 12:14:08 +02:00
bfbe68f035 update forward pass 2019-10-14 12:04:23 +02:00
0ef9bc923a Cleaning up seq2seq [WIP] 2019-10-14 11:58:13 +02:00
49cba6e543 Fix import error in script to convert fairseq roberta checkpoints 2019-10-14 01:38:57 -07:00
0993586758 remove usage of DUMMY_INPUTS
Hey @thomwolf  
This change da26bae61b (diff-8ddce309e88e8eb5b4d02228fd8881daL28-L29) removed the constant, but one usage of that constant remains in the code.
2019-10-14 02:09:53 +03:00
376e65a674 Added automatic mixed precision and XLA options to run_tf_glue.py 2019-10-13 13:19:06 +00:00
86f23a1944 Minor enhancements to run_tf_glue.py 2019-10-13 10:21:35 +00:00
5a8c6e771a Fixed the sample code in the title 'Quick tour'. 2019-10-12 14:17:17 +03:00
e76d71521c the working example code to use BertForQuestionAnswering and get an answer from a text and a question 2019-10-11 17:04:02 -07:00
d844db4005 Add citation bibtex 2019-10-11 16:55:42 -04:00
a701c9b321 CTRL to tf automodels 2019-10-11 16:05:30 -04:00
b3261e7ace read parameters from CLI, load model & tokenizer 2019-10-11 18:40:38 +02:00
d889e0b71b add base for seq2seq finetuning 2019-10-11 17:36:12 +02:00
f8e98d6779 load pretrained embeddings in Bert decoder
In Rothe et al.'s "Leveraging Pre-trained Checkpoints for Sequence
Generation Tasks", Bert2Bert is initialized with pre-trained weights for
the encoder, and only pre-trained embeddings for the decoder. The
current version of the code completely randomizes the weights of the
decoder.

We write a custom function to initialize the weights of the decoder; we
first initialize the decoder with the weights and then randomize
everything but the embeddings.
2019-10-11 16:48:11 +02:00
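A hedged sketch of that initialization strategy (parameter names follow standard BERT naming, but this is not the function added in the commit): load the pretrained checkpoint, then re-randomize everything except the embedding weights.

```python
# Sketch only: keep pretrained embeddings, re-initialize the rest of the decoder.
import torch
from transformers import BertModel

def randomize_all_but_embeddings(decoder: BertModel, std: float = 0.02) -> BertModel:
    for name, param in decoder.named_parameters():
        if name.startswith("embeddings."):
            continue                                   # keep pretrained embedding weights
        if "LayerNorm.weight" in name:
            torch.nn.init.ones_(param)                 # LayerNorm scales back to 1
        elif param.dim() > 1:
            torch.nn.init.normal_(param, mean=0.0, std=std)  # re-init weight matrices
        else:
            torch.nn.init.zeros_(param)                # re-init biases
    return decoder

decoder = randomize_all_but_embeddings(BertModel.from_pretrained("bert-base-uncased"))
```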
3ddce1d74c Release: 2.1.1 2019-10-11 06:37:49 -04:00
4428aefc63 Merge pull request #1488 from huggingface/pytorch-tpu
GLUE on TPU
2019-10-11 16:33:00 +02:00
3b43b01872 Merge pull request #1482 from huggingface/tf2_integration_tests
Integration of TF 2.0 models with other Keras modules
2019-10-11 16:25:43 +02:00
4b8f3e8f32 adding citation 2019-10-11 16:18:16 +02:00
18a3cef7d5 no nans 2019-10-11 16:09:42 +02:00
1f5d9513d8 fix test 2019-10-11 15:55:01 +02:00
0f9fc4fbde adding option to deactivate past/memory outputs 2019-10-11 15:47:08 +02:00
700331b5ec Merge pull request #1492 from stefan-it/bert-german-dbmdz-models
Add new BERT models for German (cased and uncased)
2019-10-11 13:01:52 +02:00
573dde9b44 Merge pull request #1405 from slayton58/xlnet_layer_reorder
Re-order XLNet attention head outputs for better perf
2019-10-11 12:10:58 +02:00
5f25a5f367 model: add support for new German BERT models (cased and uncased) from @dbmdz 2019-10-11 10:20:33 +02:00
f382a8decd convert int to str before adding to a str 2019-10-10 19:20:39 -04:00
639f4b7190 Don't save/load when on TPU 2019-10-10 19:17:25 +00:00
d4e7934ac3 GLUE on TPU 2019-10-10 19:03:06 +00:00
1e68c28670 add test for initialization of Bert2Rnd 2019-10-10 18:07:11 +02:00
2a4fef837a move Circle-CI from TF2-rc0 to official TF2 2019-10-10 15:57:35 +02:00
751e246087 using tf.print in roberta 2019-10-10 15:47:20 +02:00
fa218e648a fix syntax errors 2019-10-10 15:16:07 +02:00
c9e8c51946 fixing SequenceSummary head in TF 2.0 2019-10-10 15:16:05 +02:00
da26bae61b adding more tests on TF and pytorch serialization - updating configuration for better serialization 2019-10-10 14:30:48 +02:00
3e1cd8241e fix stupid (re)naming issue 2019-10-10 14:18:20 +02:00
81ee29ee8d remove the staticmethod used to load the config 2019-10-10 14:13:37 +02:00
bb04edb45b Add tests that TF 2.0 model can be integrated with other Keras modules 2019-10-10 13:08:24 +02:00
d7092d592c rename the attributes in the Bert Layer
Since the preloading of weights relies on the names of the class's
attributes, changing the namespace breaks loading pretrained weights on
Bert and all related models. I reverted `self_attention` to `attention`
and use `crossattention` for the decoder instead.
2019-10-10 12:51:14 +02:00
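A small illustration of why the rename mattered (toy modules only, not the Bert layer itself): `state_dict` keys are derived from attribute names, so renaming an attribute changes every key and a pretrained checkpoint no longer matches.

```python
# Sketch only: renaming `attention` to `self_attention` changes checkpoint keys.
import torch.nn as nn

class OldLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.attention = nn.Linear(4, 4)

class RenamedLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.self_attention = nn.Linear(4, 4)   # same module, different attribute name

checkpoint = OldLayer().state_dict()            # keys: 'attention.weight', 'attention.bias'
result = RenamedLayer().load_state_dict(checkpoint, strict=False)
print(result.missing_keys)                      # ['self_attention.weight', 'self_attention.bias']
```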
51261167b4 prune both attention and self-attention heads 2019-10-10 12:17:22 +02:00
17177e7379 add is_decoder as an attribute to Config class 2019-10-10 12:03:58 +02:00
6596e3d566 Merge pull request #1454 from bkkaggle/pytorch-built-in-tensorboard
Change tensorboard imports to use built-in tensorboard if available
2019-10-10 11:56:55 +02:00
4bc4601192 Merge pull request #1480 from huggingface/fix_ctrl_tokenizer
Fixing CTRL tokenizer - Update error messages - XLM-MLM in run_generation
2019-10-10 11:56:20 +02:00
177a721205 move back to simple space splitting 2019-10-10 11:45:47 +02:00
df85a0ff0b replace double quotes with single quotes 2019-10-10 11:38:26 +02:00
9ca788b2e8 merge the two Bert layers classes 2019-10-10 11:33:28 +02:00
a5997dd81a better error messages 2019-10-10 11:31:01 +02:00
edfc8f8225 Remove and do the branching in 2019-10-10 10:17:27 +02:00
09cfd12235 remove and do the branching in 2019-10-10 10:15:27 +02:00
43a237f15e switching to moses tokenizer 2019-10-10 10:11:16 +02:00
877ef2c6ca override from_pretrained in Bert2Rnd
In the seq2seq model we need to both load pretrained weights in the
encoder and initialize the decoder randomly. Because the
`from_pretrained` method defined in the base class relies on module
names to assign weights, it would also initialize the decoder with
pretrained weights. To avoid this we override the method to only
initialize the encoder with pretrained weights.
2019-10-10 10:02:18 +02:00
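A hedged sketch of the pattern described above (class and argument names are illustrative, not the commit's actual API): the wrapper's `from_pretrained` loads pretrained weights into the encoder only, while the decoder is freshly built from the same configuration and therefore randomly initialized.

```python
# Sketch only: pretrained encoder, randomly initialized decoder.
from transformers import BertConfig, BertModel

class Bert2Rnd:
    def __init__(self, encoder: BertModel, decoder: BertModel):
        self.encoder = encoder
        self.decoder = decoder

    @classmethod
    def from_pretrained(cls, pretrained_name: str) -> "Bert2Rnd":
        encoder = BertModel.from_pretrained(pretrained_name)    # pretrained weights
        decoder_config = BertConfig.from_pretrained(pretrained_name)
        decoder = BertModel(decoder_config)                     # random initialization
        return cls(encoder, decoder)

model = Bert2Rnd.from_pretrained("bert-base-uncased")
```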
851ef592c5 add comment on recursive weights loading 2019-10-10 10:02:03 +02:00
036483fae5 Temporary CTRL tokenizer fix 2019-10-09 16:33:15 -04:00
9c2e0a4acf Release: 2.1.0 2019-10-09 12:14:03 -04:00
7fe98d8c18 Update CTRL documentation 2019-10-09 12:12:36 -04:00
89f86f9661 CTRL added to the documentation 2019-10-09 12:04:06 -04:00
e17ea08e24 Pycharm folder added to gitignore 2019-10-09 11:32:21 -04:00
2431fea98a Merge pull request #1383 from keskarnitish/master
Adding CTRL
2019-10-09 11:31:05 -04:00
d9e60f4f0d Merge branch 'master' into pr/1383 2019-10-09 17:25:08 +02:00
e84470ef81 Merge pull request #1384 from huggingface/encoding-qol
Quality of life enhancements in encoding + patch MLM masking
2019-10-09 11:18:24 -04:00
07d055f849 higher tolerance 2019-10-09 17:10:04 +02:00
48b438ff2a doc and conversion 2019-10-09 17:06:30 +02:00
69629c4f0f Improve naming and only do regex when necessary 2019-10-09 08:48:40 -04:00
bf34a252b8 Golden path 2019-10-09 08:48:40 -04:00
528d3f327b Improve readability and improve make less assumptions about checkpoint format 2019-10-09 08:48:40 -04:00
56301bd9e8 Extract method 2019-10-09 08:48:40 -04:00
d6c5469712 Delete older checkpoint after saving new checkpoint 2019-10-09 08:48:40 -04:00
54a31f50fb Add save_total_limit 2019-10-09 08:48:40 -04:00
c19b8e4ae0 fixing CTRL tests and OpenAI GPT tests 2019-10-09 13:51:05 +02:00
6dce6dda1b fixing TF 2.0 model - adding more severe test on pt/tf equivalence 2019-10-09 11:57:55 +02:00
c56d921dda adding TF 2.0 model 2019-10-09 11:07:43 +02:00
1c5079952f simpler distilbert mask - fix tf tests 2019-10-09 04:26:20 +02:00
58b302caf3 Merge pull request #1398 from dveselov/patch-1
Fixed typo in docs README
2019-10-09 03:52:42 +02:00
439fac723a Merge pull request #1409 from brian41005/master
Evaluation result.txt path changing #1286
2019-10-09 03:14:34 +02:00
23b7138ab4 fix #1378 and #1453 2019-10-09 01:54:44 +02:00
5ce8d29abe Change tensorboard imports to use built-in tensorboard if available 2019-10-08 16:29:43 -05:00
d688af19e5 Update link to swift-coreml-transformers
cc @lysandrejik
2019-10-08 16:37:52 -04:00
45dc04f33d tf model [WIP] 2019-10-08 17:37:17 +02:00
770b15b58c rename class in __init__ 2019-10-08 17:32:28 +02:00
248314772f fix tokenization 2019-10-08 17:19:28 +02:00
03c2c762a6 update tokenizer 2019-10-08 17:12:03 +02:00
3edfa1d6aa update model to use past 2019-10-08 17:11:58 +02:00
f4d41fe33e Merge pull request #1448 from huggingface/contributing
add contribution guidelines
2019-10-08 16:55:34 +02:00
61ed889005 remove old seq2seq file 2019-10-08 16:30:58 +02:00
8abfee9ec3 rename Bert2Bert -> Bert2Rnd 2019-10-08 16:30:58 +02:00
82628b0fc9 add a placeholder test 2019-10-08 16:30:58 +02:00
0700983090 Add BertDecoderModel and Bert2Bert classes
I am not sure what happens when the class is initialized with the
pretrained weights.
2019-10-08 16:30:58 +02:00
75feacf172 add general structure for Bert2Bert class 2019-10-08 16:30:58 +02:00
15a2fc88a6 add General attention classes
The modifications that I introduced in a previous commit did break
Bert's internal API. I reverted these changes and added more general
classes to handle the encoder-decoder attention case.

There may be a more elegant way to deal with backward compatibility (I am
not comfortable with the current state of the code), but I cannot see it
right now.
2019-10-08 16:30:58 +02:00
cd6a59d5c1 add a decoder layer for Bert 2019-10-08 16:30:58 +02:00
45de313a9e add bullet point on modifying an existing PR 2019-10-08 11:54:10 +02:00
ade05b6cef add code contribution 2019-10-07 23:20:25 +02:00
e9c09052a4 add issues and requests guidelines 2019-10-07 22:30:55 +02:00
8fcc6507ce Multilingual 2019-10-07 15:02:42 -04:00
6e3e1c959e Merge pull request #1447 from huggingface/dev-requirements
Provide requirements.txt for development dependencies
2019-10-07 18:49:26 +02:00
7ce83b4931 update weights for distilgpt2 2019-10-07 12:30:27 -04:00
9f81f1cba8 fix convert pt_to_tf2 for custom weights 2019-10-07 12:30:19 -04:00
7afd00a661 freeze dev requirements 2019-10-07 17:58:13 +02:00
a0dcefa382 generalize BertSelfAttention to take separate query, key, value
There is currently no way to specify the query, key and value separately
in the Attention module. However, the decoder's "encoder-decoder
attention" layers take the decoder's last output as a query, the
encoder's states as key and value. We thus modify the existing code so
query, key and value can be added separately.

This obviously raises some naming issues; `BertSelfAttention` is not
a self-attention module anymore. The way the residual is forwarded is
now awkward, etc. We will need to do some refactoring once the decoder is
fully implemented.
2019-10-07 17:53:58 +02:00
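A hedged sketch of that generalization (the module below is a toy single-head attention, not the library's `BertSelfAttention`): accepting query, key and value separately covers both self-attention, where they are all the same tensor, and encoder-decoder attention, where the query comes from the decoder and the key/value from the encoder states.

```python
# Sketch only: one attention module handling both self- and cross-attention.
import math
import torch
import torch.nn as nn

class GeneralAttention(nn.Module):
    def __init__(self, hidden_size: int):
        super().__init__()
        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, query, key=None, value=None):
        key = query if key is None else key          # default: plain self-attention
        value = key if value is None else value
        q, k, v = self.q_proj(query), self.k_proj(key), self.v_proj(value)
        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(q.size(-1))
        return torch.matmul(scores.softmax(dim=-1), v)

attn = GeneralAttention(hidden_size=16)
decoder_states, encoder_states = torch.randn(2, 5, 16), torch.randn(2, 7, 16)
self_out = attn(decoder_states)                      # self-attention
cross_out = attn(decoder_states, encoder_states)     # encoder-decoder attention
```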
31adbb247c add class wireframes for Bert decoder 2019-10-07 16:43:21 +02:00
dda1adad6d rename BertLayer to BertEncoderLayer 2019-10-07 16:31:46 +02:00
0053c0e052 do some (light) housekeeping
Several packages were imported but never used, indentation and line
spaces did not follow PEP8.
2019-10-07 16:29:15 +02:00
bd5363cc83 update CTRL configuration 2019-10-07 15:37:30 +02:00
dc89441167 update CTRL pytorch model 2019-10-07 15:37:25 +02:00
320b7a7e01 fix #1416 2019-10-07 14:26:59 +02:00
386e86e222 raise exception when class initialized with __init__ 2019-10-07 13:00:06 +02:00
4446c02b8a add wireframe for seq2seq model 2019-10-07 12:04:05 +02:00
1615360c71 Merge pull request #1438 from SeanBE/master
fix pytorch-transformers migration description in README
2019-10-07 05:02:23 -04:00
6dc6c716c5 fix pytorch-transformers migration description in README 2019-10-07 09:59:54 +01:00
904158ac4d Rephrase forward method to reduce ambiguity 2019-10-06 23:40:52 -04:00
0f65d8cbbe Fix some typos in README 2019-10-06 23:40:52 -04:00
1dea291a02 Remove unnecessary use of FusedLayerNorm in XLNet 2019-10-06 13:35:01 -04:00
f3e0218fbb Correct device assignment in run_generation 2019-10-05 21:05:16 -04:00
78ef1a9930 fixes 2019-10-04 17:59:44 -04:00
6c1d0bc066 update encode_plus - add truncation strategies 2019-10-04 17:38:38 -04:00
0820bb0555 unnecessary carriage return 2019-10-04 17:23:15 -04:00
f5891c3821 run_squad --> run_squad_w_distillation 2019-10-04 17:23:15 -04:00
764a7923ec add distillation+finetuning option in run_squad 2019-10-04 17:23:15 -04:00
bb464289ce New model addition issue template 2019-10-04 16:41:26 -04:00
92c0f2fb90 Merge remote-tracking branch 'origin/julien_multiple-choice' into encoding-qol 2019-10-04 15:48:06 -04:00
9e136ff57c Honor args.overwrite_cache (h/t @erenup) 2019-10-04 15:00:56 -04:00
7bddb45a6f Decode documentation 2019-10-04 14:27:38 -04:00
dbed1c5d94 Adding CTRL (squashed commit)
adding conversion script

adding first draft of modeling & tokenization

adding placeholder for test files

bunch of changes

registering the tokenizer/model/etc

tests

change link; something is very VERY wrong here

weird end-of-word thingy going on

i think the tokenization works now ; wrote the unit tests

overall structure works; load w next

the monster is alive!

works after some cleanup as well

adding emacs autosave to gitignore

currently only supporting the 48 layer one; seems to infer fine on my macbook

cleanup

fixing some documentation

fixing some documentation

tests passing?

now works on CUDA also

adding greedy?

adding greedy sampling

works well
2019-10-03 22:29:03 -07:00
b3cfd97946 Merge pull request #1373 from TimYagan/fix-css
Fixed critical css font-family issues
2019-10-03 19:04:02 -04:00
81a1e12469 Merge pull request #1313 from enzoampil/master
Add option to use a 'stop token'
2019-10-03 22:43:57 +00:00
d3f24dfad7 Merge branch 'master' into master 2019-10-03 22:43:09 +00:00
ecc4f1bdfa XLM use_lang_embedding flag in run_generation 2019-10-03 17:42:16 -04:00
c2c2ca0fdb Added XLM to run_generation, with prompt language selection. 2019-10-03 17:18:48 -04:00
1569610f2d Merge pull request #1296 from danai-antoniou/add-duplicate-tokens-error
Added ValueError for duplicates in list of added tokens
2019-10-03 17:06:17 -04:00
e1b2949ae6 DistillBert Documentation Code Example fixes 2019-10-03 15:51:33 -04:00
899883644f Fix test fails and warnings
Attention output was in bnij ordering instead of ijbn which everything
else will expect. This was an oversight on my part, and keeps the
attention inputs/outputs identical to the original code.

Also moved back from tensor slicing to index_select in rel_shift_bnij to
make the tracer happy.
2019-10-03 12:05:15 -04:00
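An illustrative sketch of the two points in that message (dimension sizes are arbitrary, and this is not XLNet's actual code): the 'bnij' ordering is (batch, heads, query, key) versus 'ijbn' = (query, key, batch, heads), and `index_select` yields the same values as slicing while being friendlier to the torch.jit tracer.

```python
# Sketch only: bnij vs ijbn ordering, and index_select vs slicing.
import torch

attn_bnij = torch.randn(2, 4, 5, 5)          # (batch, heads, i, j)
attn_ijbn = attn_bnij.permute(2, 3, 0, 1)    # (i, j, batch, heads)
assert attn_ijbn.shape == (5, 5, 2, 4)

keep = torch.arange(1, attn_bnij.size(3))
sliced = attn_bnij[:, :, :, 1:]                              # tensor slicing
selected = torch.index_select(attn_bnij, dim=3, index=keep)  # tracer-friendly equivalent
assert torch.equal(sliced, selected)
```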
e2ae9c0b73 fix links in doc index 2019-10-03 11:42:21 -04:00
aebd83230f Update naming + remove f string in run_lm_finetuning example 2019-10-03 11:31:36 -04:00
651bfb7ad5 always_truncate by default 2019-10-03 11:31:36 -04:00
5ed50a93fb LM finetuning won't mask special tokens anymore 2019-10-03 11:31:36 -04:00
cc412edd42 Supports already existing special tokens 2019-10-03 11:31:36 -04:00
2f259b228e Sequence IDS 2019-10-03 11:31:36 -04:00
7c789c337d Always truncate argument in the encode method 2019-10-03 11:31:36 -04:00
7af0777910 Update run_glue.py
add DistilBert model shortcut into ALL_MODELS
2019-10-03 15:31:11 +00:00
c1689ac301 fix name 2019-10-03 10:56:39 -04:00
4a790c40b1 update doc for distil* 2019-10-03 10:54:02 -04:00
6be46a6e64 update links to new weights 2019-10-03 10:27:11 -04:00
5f07d8f11a prepare release 2019-10-03 10:27:11 -04:00
35071007cb incoming release 🔥 update links to arxiv preprint 2019-10-03 10:27:11 -04:00
f1f23ad171 fix buf in convert_pt_chkpt_to_tf2 2019-10-03 10:27:11 -04:00
2a91f6071f update README - TODO update link to paper 2019-10-03 10:27:11 -04:00
c51e533a5f update train.py 2019-10-03 10:27:11 -04:00
a76c3f9cb0 update requirements 2019-10-03 10:27:11 -04:00
bb9c5ead54 update distiller 2019-10-03 10:27:11 -04:00
a12ab0a8db update binarized_data 2019-10-03 10:27:11 -04:00
4d6dfbd376 update extract 2019-10-03 10:27:11 -04:00
23edebc079 update extract_distilbert 2019-10-03 10:27:11 -04:00
cbfcfce205 update token_counts 2019-10-03 10:27:11 -04:00
19e4ebbe3f grouped_batch_sampler 2019-10-03 10:27:11 -04:00
594202a934 lm_seqs_dataset 2019-10-03 10:27:11 -04:00
38084507c4 add distillation_configs 2019-10-03 10:27:11 -04:00
9ffda216ec Fix missed head transpose 2019-10-03 09:23:16 -04:00
b5d73976ad Revert "fixing for roberta tokenizer decoding"
This reverts commit 22e7c4edaf007d92912df54c336d10078ac7d565.
2019-10-03 20:48:17 +08:00
22e7c4edaf fixing for roberta tokenizer decoding 2019-10-03 18:33:53 +08:00
2195c0d5f9 Evaluation result.txt path changing #1286 2019-10-03 12:49:12 +08:00
ebb32261b1 fix #1401 2019-10-02 17:52:56 -04:00
d51b589404 Re-order attention head outputs for better perf
Significant performance boost over the original orderings
on an already somewhat optimised branch this gave me > 2x end-to-end
throughput on a squad xlnet fine-tuning task (batch 8, seq-length 612,
fp16)
2019-10-02 12:18:21 -04:00
63ed224b7c initialy -> initially 2019-10-02 15:04:18 +00:00
a95158518d Moved duplicate token check 2019-10-02 07:44:15 +01:00
d73957899a Merge branch 'master' of https://github.com/danai-antoniou/pytorch-transformers into add-duplicate-tokens-error 2019-10-02 07:38:50 +01:00
cd69bc9c87 Fixed typo in docs README 2019-10-02 03:21:55 +03:00
391db836ab fix #1260 - remove special logic for decoding pairs of sequence 2019-10-01 19:09:13 -04:00
963529e29b Merge pull request #1288 from echan00/master
Typo with LM Fine tuning script
2019-10-01 18:46:07 -04:00
f7978f70ec use format instead of f-strings 2019-10-01 18:45:38 -04:00
1e4a191366 Merge pull request #1284 from slayton58/pooler_end_logits_fp16_fix
Fix fp16 masking in PoolerEndLogits
2019-10-01 18:40:22 -04:00
c50783e388 Merge branch 'pooler_end_logits_fp16_fix' of https://github.com/slayton58/pytorch-transformers into pr/1284 2019-10-01 18:17:48 -04:00
6971556ab8 Fix syntax typo in README.md 2019-10-01 14:59:31 -04:00
b350662955 overflowing_tokens do not really make sense here, let's just return a number
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2019-09-30 16:37:09 -04:00
f5bcde0b2f [multiple-choice] Simplify and use tokenizer.encode_plus 2019-09-30 16:04:55 -04:00
5c3b32d44d Update README.md
Lines 183 - 200, fixed indentation. Line 198, replaced `tokenizer_class` with `BertTokenizer`, since `tokenizer_class` is not defined in the loop it belongs to.
2019-09-30 18:48:01 +00:00
2dc8cb8734 fix unknown imports (*ForMultipleChoice) in run_multiple_choice 2019-09-29 19:51:01 -04:00
0a4ed7192e Fixed critical css font-family issues
Fixed critical css font-family issues to ensure compatibility with multiple web browsers
2019-09-29 13:51:01 +02:00
ae50ad91ea Merge pull request #1362 from FeiWang96/doc
fix link
2019-09-28 10:26:42 +02:00
60f791631b Fix link in readme 2019-09-28 16:20:17 +08:00
a6a6d9e638 fix padding_idx of RoBERTa model 2019-09-27 19:03:55 -04:00
d8b641c839 6 -> 8 models 2019-09-27 17:22:01 -04:00
c6acbdd50a Close #1304 2019-09-27 17:02:53 -04:00
df7cd9e4e4 Merge pull request #1353 from wendingp/patch-1
Fix some typos
2019-09-27 23:00:34 +02:00
6a17b3c51b Merge pull request #1355 from agrinh/master
Fix tensorflow_dataset glue support
2019-09-27 22:59:54 +02:00
04e9a6f512 Merge pull request #1359 from dennymarcels/patch-1
Update run_lm_finetuning.py
2019-09-27 22:58:19 +02:00
9478590630 Update run_lm_finetuning.py
The previous method, just as phrased, did not exist in the class.
2019-09-27 15:18:42 -03:00
795b3e76ff Add docstring for processor method 2019-09-27 17:32:28 +02:00
e31a472801 Fix tensorflow_dataset glue support
`glue_convert_examples_to_features` assumed that tensorflow_dataset
examples contain the features `'sentence1'` and `'sentence2'`. This
commit encapsulates the choice of features in the glue processor and
uses that to parse examples.
2019-09-27 17:16:02 +02:00
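A hedged, simplified sketch of that encapsulation (the processor and method names below are illustrative): each GLUE processor decides how to turn a raw tensorflow_datasets example into text_a/text_b/label, so the conversion function no longer hard-codes `'sentence1'`/`'sentence2'`.

```python
# Sketch only: processor-owned example parsing instead of hard-coded feature names.
class MrpcProcessor:
    def get_example_from_tensor_dict(self, tensor_dict):
        # tensor_dict: one tfds example as a dict
        return tensor_dict["sentence1"], tensor_dict["sentence2"], tensor_dict["label"]

def convert_examples_to_features(examples, processor, tokenizer, max_length=128):
    features = []
    for ex in examples:
        text_a, text_b, label = processor.get_example_from_tensor_dict(ex)
        input_ids = tokenizer.encode(text_a, text_b, add_special_tokens=True)[:max_length]
        features.append((input_ids, label))
    return features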
4f2b6579bf Fix some typos 2019-09-27 22:55:43 +08:00
ca559826c4 Merge pull request #1349 from ogabrielluiz/master
Just some typos
2019-09-27 13:08:00 +02:00
d2de5b9d8c Just some typos 2019-09-27 07:08:36 -03:00
d83d295763 Merge pull request #1337 from mgrankin/fastdataset
faster dataset building
2019-09-27 10:35:12 +02:00
f6de000305 Merge pull request #1346 from BramVanroy/documentation
Add small  note about the output of hidden states (closes #1332)
2019-09-27 10:30:07 +02:00
15749bfc10 Add small note about the output of hidden states 2019-09-27 10:01:36 +02:00
da2e47ad15 clean up a little run_tf_glue 2019-09-27 09:41:15 +02:00
528c288fa9 clean up run_tf_glue 2019-09-27 09:40:29 +02:00
702f589848 fix input in run_glue for distilbert 2019-09-27 00:20:14 -04:00
22d2fded2c [docs] Fix doc auto-deploy
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2019-09-26 18:22:45 -04:00
fc9faa8a47 [docs] Doc tweaks
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2019-09-26 18:19:51 -04:00
ecfddc6034 Update RoBERTa and GPT-2 Tokenizer documentation (fix #1343) 2019-09-26 16:49:03 -04:00
93f0c5fc72 Repository link in the documentation 2019-09-26 11:45:00 -04:00
6c3b131516 typo in readme/doc 2019-09-26 16:23:28 +02:00
f83b35b77d Merge branch 'master' of https://github.com/huggingface/pytorch-transformers 2019-09-26 16:14:23 +02:00
4e63c90720 update installation instructions in readme 2019-09-26 16:14:21 +02:00
7e957237e4 [Doc] XLM + Torch in documentation 2019-09-26 10:08:56 -04:00
302a4813a5 Doc building requirements [TF2] 2019-09-26 09:57:30 -04:00
f71a4577b8 faster dataset building 2019-09-26 16:53:13 +03:00
a3e0dbba95 Doc building requirements [TF] 2019-09-26 09:51:14 -04:00
0f92f76ca3 CircleCI reference in README 2019-09-26 08:59:52 -04:00
4094958df2 Doc building requirements 2019-09-26 08:50:55 -04:00
7d8b395afa Doc building requirements 2019-09-26 08:49:31 -04:00
927904bc91 [doc] pytorch_transformers -> transformers 2019-09-26 08:47:15 -04:00
294edfd83d Release version in documentation 2019-09-26 08:16:12 -04:00
de5e4864cb Documentation 2019-09-26 08:04:54 -04:00
e4e35296fb update setup.py metadata 2019-09-26 13:52:24 +02:00
1d646badbb Merge branch 'master' of https://github.com/huggingface/pytorch-transformers 2019-09-26 13:48:00 +02:00
9676d1a2a8 update readme and setup.py 2019-09-26 13:47:58 +02:00
8349d75773 Various small doc fixes 2019-09-26 07:45:40 -04:00
fb056494e5 Example usage 2019-09-26 07:45:40 -04:00
36f592cc82 Updated doc for InputExample and InputFeatures 2019-09-26 07:45:40 -04:00
ad4a393e2e Changed processor documentation architecture. Added documentation for GLUE 2019-09-26 07:45:40 -04:00
c4ac7a76db GLUE processors 2019-09-26 07:45:40 -04:00
4acd87ff4e TF models added to documentation 2019-09-26 07:45:40 -04:00
cf5c5c9e1c Documentation 2019-09-26 07:43:13 -04:00
4dde31cb76 update readme 2019-09-26 12:18:26 +02:00
17ea43cf98 Merge pull request #1203 from huggingface/tf2
[2.0] TF 2.0 support
2019-09-26 12:11:03 +02:00
80bf868a26 Merge branch 'master' into tf2 2019-09-26 12:04:47 +02:00
481d9c4fb5 Merge branch 'master' into tf2 2019-09-26 12:02:54 +02:00
4ddc31ff40 update readme with migration change 2019-09-26 12:00:38 +02:00
f47f7f4611 add logo 2019-09-26 11:28:44 +02:00
9fabc0b6a9 wip readme 2019-09-26 11:21:34 +02:00
31c23bd5ee [BIG] pytorch-transformers => transformers 2019-09-26 10:15:53 +02:00
2f071fcb02 clean up TFConv1D API 2019-09-26 10:09:45 +02:00
5705333441 add initialization for everybody 2019-09-26 10:06:20 +02:00
f2a337b3ed fix tokenization tests for gpt2 roberta 2019-09-26 09:02:43 +02:00
4a233e5b2c Merge pull request #1315 from bryant1410/patch-1
Remove unnecessary use of FusedLayerNorm
2019-09-26 08:50:02 +02:00
7a99e4b196 fix #1196 and fix #1285 2019-09-26 08:41:02 +02:00
7c9f8f93f9 fix tests 2019-09-26 01:59:53 +02:00
d6dde438ea add batch dimension in encode 2019-09-26 01:45:55 +02:00
4a21c4d88d add warning if neither pt nor tf are found 2019-09-26 01:30:06 +02:00
2967de06f4 adding intialization to bert 2019-09-25 22:08:38 +02:00
a6bcfb8015 fix tests 2019-09-25 21:14:12 +02:00
78863f6b36 fix tokenizer to tensors 2019-09-25 21:09:46 +02:00
8a618e0af5 clean up __init__ 2019-09-25 21:04:52 +02:00
3b7fb48c3b fix loading from tf/pt 2019-09-25 17:46:16 +02:00
a049c8043b push fix to training 2019-09-25 17:33:16 +02:00
a9f24a16bc [FIX] fix run_generation.py to work with batch_size > 1 2019-09-25 15:53:29 +03:00
5def3302f4 update run_glue 2019-09-25 12:38:08 +02:00
f71758f7a4 update internal glue processors 2019-09-25 12:00:50 +02:00
0f091062d4 Merge branch 'glue-example' into tf2 2019-09-25 10:21:52 +02:00
c4acc3a8e9 let encode accept tensor inputs 2019-09-25 10:19:14 +02:00
e8e956dbb2 Merge pull request #1327 from huggingface/tf2-determinism
Pytorch/TF2 determinism
2019-09-24 22:49:57 +02:00
e4022d96f7 Merge pull request #1325 from huggingface/glue-included
[Proposal] GLUE processors included in library
2019-09-24 21:40:10 +02:00
1761d2091a Check to see if the models have the same results when in eval mode (pt) or when training=False (tf) 2019-09-24 14:59:10 -04:00
789ea72037 fix output_token_type in glue 2019-09-24 17:32:01 +02:00
1cbd566c63 Merge branch 'glue-example' into glue-included 2019-09-24 17:24:52 +02:00
743e383d4b py2 fix 2019-09-24 17:21:54 +02:00
99a90e43d4 update data processors __init__ 2019-09-24 17:16:46 +02:00
b5ec526f85 updated data processor and metrics 2019-09-24 17:10:50 +02:00
a6981076ec various updates 2019-09-24 16:46:26 +02:00
0b82e3d0d9 Relative imports 2019-09-24 09:52:25 -04:00
f09e5ecef0 [Proposal] GLUE processors included in library 2019-09-24 09:47:34 -04:00
128bdd4c35 fix tests pt/tf 2019-09-24 15:43:39 +02:00
72402d1acd Fixed DistilBERT tokenizer 2019-09-24 09:41:14 -04:00
28a30af6d1 fix auto models 2019-09-24 15:33:39 +02:00
de203853cc docstring for xlnet 2019-09-24 15:30:55 +02:00
559790f9e4 docstring for xlm 2019-09-24 15:26:57 +02:00
b3087ddde8 docstring t-xl 2019-09-24 15:21:51 +02:00
4761a39781 docstring roberta 2019-09-24 15:19:09 +02:00
45a6f2edd9 docstring for GPT 2019-09-24 15:15:47 +02:00
e7ba5bc85b docstring for GPT2 2019-09-24 15:12:36 +02:00
d340e2329e create_mask_from_sequences -> create_token_type_ids_from_sequences 2019-09-24 09:09:28 -04:00
b94f73bab7 distilbert docstring 2019-09-24 15:06:51 +02:00
9678c49419 docstrings for bert 2019-09-24 14:57:05 +02:00
f3d1511b5b fix imports 2019-09-24 14:42:09 +02:00
dd2d90f344 update automodels 2019-09-24 14:39:41 +02:00
ee261439a9 add save_pretrained 2019-09-24 14:30:28 +02:00
29bb3e4eb0 double loading ok 2019-09-24 14:23:46 +02:00
f5397ffc3b update loading logics 2019-09-24 14:03:58 +02:00
271f213621 updating to load tf model in pt - fixing headmasking test 2019-09-24 13:51:28 +02:00
cf9c1cbb60 fix tests when only using tf 2019-09-24 13:32:47 +02:00
2167e366ba update circleCi 2019-09-24 13:27:45 +02:00
e9a103c17a bidirectional conversion TF <=> PT - extended tests 2019-09-24 13:25:50 +02:00
c832f43a4d output_token_type -> token_type_ids 2019-09-24 07:21:38 -04:00
3927d7756c Updated the GLUE pre-processing method 2019-09-24 07:15:11 -04:00
0ea82b246f Updated tests 2019-09-24 07:10:09 -04:00
9d44236f70 Updated DistilBERT 2019-09-24 07:03:24 -04:00
a7e01a248b converting distilled/fine-tuned models 2019-09-24 10:58:52 +02:00
8ba44ced95 fix roberta conversion script 2019-09-24 09:48:23 +02:00
2b11fa5174 update __init__ and conversion script 2019-09-23 22:35:45 +02:00
6448396d54 fix roberta test 2019-09-23 22:27:13 +02:00
1e47dee24c Merge branch 'tf2' of https://github.com/huggingface/pytorch-transformers into tf2 2019-09-23 22:08:10 +02:00
c9591f6fac updated models input format + tests 2019-09-23 22:08:08 +02:00
798da627eb Fix TFBert tests in Python 3.5 2019-09-23 12:06:10 -04:00
c014d1f0c6 fix the skipping 2019-09-23 16:39:57 +02:00
0b22e47a40 skipping pretrained TF model tests for now 2019-09-23 16:38:03 +02:00
830d212be7 test circleCI h5py version 2019-09-23 16:26:06 +02:00
7c0f2d0a6a Merge pull request #1294 from sshleifer/delete-n-special-doc
Delete n_special reference in docstring
2019-09-23 14:54:55 +01:00
a31e591d27 fix XLM tests 2019-09-23 15:54:10 +02:00
447de34dde tests for distilbert and roberta 2019-09-23 15:38:29 +02:00
98dd19b96b Remove unnecessary use of FusedLayerNorm 2019-09-22 20:31:36 -04:00
4b543c3007 Add option to use a 'stop token' which will be used to truncate the output text to everything till right before the 'stop token' 2019-09-22 21:38:38 +08:00
68a3e0223a roberta and distilbert 2019-09-20 23:14:51 +02:00
a2d4950f5c fix annotation 2019-09-20 10:59:35 -04:00
9f995b99d4 minor fixes 2019-09-19 21:36:06 +00:00
3fe5c8e8a8 update bert-base-uncased results 2019-09-19 19:34:22 +00:00
354944e607 [distillation] big update w/ new weights 2019-09-19 19:25:21 +00:00
2e6797cc7d Added ValueError for duplicate added tokens 2019-09-19 15:40:42 +01:00
ab984a8b72 Python 2 compatibility 2019-09-19 15:01:33 +02:00
3df208c93a Tokenizer accepts token list as well as string 2019-09-19 14:47:52 +02:00
66ea76b8a9 prepare_for_model and prepare_pair_for_model methods. Added an option to select which sequence will be truncated. 2019-09-19 13:50:51 +02:00
60414f31a9 GLUE updated with new methods 2019-09-19 10:55:06 +02:00
baa74326ab Stride + tests + small fixes 2019-09-19 10:55:06 +02:00
c10c7d59e7 Mask computing in standalone method. Tests. 2019-09-19 10:55:06 +02:00
bf503158c5 Sentence -> Sequence. Removed output_mask from the special token addition methods. 2019-09-19 10:55:06 +02:00
8cba057260 Doc + remove artefacts 2019-09-19 10:55:06 +02:00
6393261e41 encode + encode_plus tests modified 2019-09-19 10:55:06 +02:00
dcc9bb3252 Modified encode to return only lists. Added a more complete encode_plus method 2019-09-19 10:55:06 +02:00
af23b626c8 Max encoding length + corresponding tests 2019-09-19 10:55:06 +02:00
c4d4f3ec8c Updated DistilBERT test to reflect the sequence encoding 2019-09-19 10:55:06 +02:00
d572d7027b Number of added tokens calculator 2019-09-19 10:55:06 +02:00
de8e14b6c0 Added DistilBERT to run_squad script 2019-09-19 10:55:06 +02:00
88368c2a16 Added DistilBERT to run_lm_finetuning 2019-09-19 10:55:06 +02:00
2d8ec5a684 Changed warning to be more explicit
Co-authored by: julien_c <chaumond@gmail.com>
2019-09-19 10:55:06 +02:00
75635072e1 Updated GLUE script to add DistilBERT. Cleaned up unused args in the utils file. 2019-09-19 10:55:06 +02:00
92a9976e91 Distilbert sequence builder w/ mask 2019-09-19 10:55:06 +02:00
59057abe52 typo 2019-09-19 10:55:06 +02:00
bac332fec0 Updated the GLUE data processor. Corrections to RoBERTa and XLNet. 2019-09-19 10:55:06 +02:00
c3df2136e1 Added binary masking tests 2019-09-19 10:55:06 +02:00
e391d4735e Tokenizers' encode function can output binary masks 2019-09-19 10:55:06 +02:00
119610b5c5 Merge branch 'master' into delete-n-special-doc 2019-09-19 01:35:01 -07:00
08e4ad5eea Remove documentation for unused kwarg 2019-09-18 16:35:01 -07:00
f0340eccf9 Typo
Typo
2019-09-18 13:42:11 -07:00
0d1dad6d53 Merge pull request #1004 from erenup/master
Refactoring old run_swag.py
2019-09-18 21:42:51 +02:00
8960988f35 fixed to find best dev acc 2019-09-19 01:10:05 +08:00
b57bfb5fa0 Merge pull request #3 from erenup/run_multiple_choice_merge
Run multiple choice merge
2019-09-18 21:45:04 +08:00
46ffc28329 Merge branch 'master' into run_multiple_choice_merge
# Please enter a commit message to explain why this merge is necessary,
# especially if it merges an updated upstream into a topic branch.
#
# Lines starting with '#' will be ignored, and an empty message aborts
# the commit.
2019-09-18 21:43:46 +08:00
ec94f4e0f8 Fix fp16 masking in PoolerEndLogits
Necessary to run xlnet (at least in squad) with `--fp16 --fp16_opt_level="O2"`, otherwise loss is immediately `NaN` and fine-tuning cannot proceed.
2019-09-18 09:30:58 -04:00
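A hedged illustration of the underlying fp16 issue (toy tensors only, not the PoolerEndLogits code): a hard-coded constant like -1e30 overflows to -inf in float16 and later arithmetic can produce NaN, whereas a dtype-aware constant stays representable.

```python
# Sketch only: why -1e30 breaks in fp16 and what a dtype-aware fill value looks like.
import torch

scores = torch.zeros(4, dtype=torch.float16)
pad_mask = torch.tensor([False, False, True, True])

print(torch.tensor(-1e30, dtype=torch.float16))   # -inf: the constant overflows in fp16
safe_min = -torch.finfo(scores.dtype).max         # -65504.0, still representable
masked = scores.masked_fill(pad_mask, safe_min)
print(masked)                                      # tensor([0., 0., -65504., -65504.], dtype=torch.float16)
```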
15143fbad6 move run_multiple_choice.py and utils_multiple_choice.py to examples 2019-09-18 21:18:46 +08:00
3cd6289758 Merge remote-tracking branch 'huggingface/master' into run_multiple_choice_merge
# Conflicts:
#	examples/contrib/run_swag.py
2019-09-18 21:16:59 +08:00
36362cf086 move schedule.step after optimizer.step 2019-09-18 21:13:40 +08:00
3a527fa820 OpenAI GPT tests ok 2019-09-18 14:15:48 +02:00
556442afb3 hot fix 2019-09-18 14:12:41 +02:00
160b5d6080 fix xlm lang_embeddings loading 2019-09-18 14:10:20 +02:00
26497d1199 fix tests 2019-09-18 12:17:21 +02:00
6a083fd447 update pt-tf conversion script 2019-09-18 12:11:32 +02:00
f6969cc12b upgrade max model difference to 2e-2 (for transfo-xl adaptive softmax + inputs) 2019-09-18 11:12:02 +02:00
e768f2322a update run_openai_gpt to fix #1264 2019-09-18 10:07:47 +02:00
8334993915 clean up examples - updated to new keyword inputs - #1246 2019-09-18 10:01:27 +02:00
62760baf46 tiny fixes 2019-09-17 18:29:15 -04:00
45de034bf8 fix #1223 2019-09-17 10:25:06 +02:00
5a81e79e25 Merge pull request #2 from erenup/run_multiple_choice_add_doc
Run multiple choice add doc
2019-09-16 22:39:54 +08:00
5882c442e5 add example usage 2019-09-16 22:38:08 +08:00
a9debaca3d fixed init_weight 2019-09-16 19:55:24 +08:00
c88f05163d fix typo in XLM models 2019-09-16 13:42:20 +02:00
982f181aa7 Merge remote-tracking branch 'origin/master' into run_multiple_choice_add_doc 2019-09-16 19:12:00 +08:00
84b9d1c423 Merge remote-tracking branch 'huggingface/master'
# Conflicts:
#	pytorch_transformers/__init__.py
2019-09-16 19:06:12 +08:00
603b470a3d add warning info 2019-09-16 18:53:37 +08:00
4812a5a767 add doc string 2019-09-16 11:50:18 +08:00
4b956b2a6b add layer_norm_epsilon configuration for transformer xl 2019-09-13 17:09:20 +02:00
b97af8cce9 skip finetuned checkpoints 2019-09-13 16:43:49 +02:00
65c49bb27e adding TF 2.0 adaptive softmax with logits + loss outputs 2019-09-13 15:50:51 +02:00
39c38b2ea0 fix 2019-09-12 16:47:11 +02:00
dcddf498c8 fix bert layernorm 2019-09-12 16:46:32 +02:00
d3a3a0353c clean up cache after conversion 2019-09-12 16:42:52 +02:00
a84adddd1b convert all models 2019-09-12 13:14:07 +02:00
32e1332acf [distil] fix once for all general logger for scripts 2019-09-11 14:19:07 +00:00
b62abe87c9 Merge pull request #1249 from ziliwang/master
fixed: hard-coded max and min numbers go out of range in fp16, which causes NaN.
2019-09-11 15:53:28 +02:00
969d3ae95e XLMWithLMHead fixed - standardize conversion 2019-09-11 15:47:33 +02:00
646711e1e2 standardize scopes names - add conversion methods 2019-09-11 15:34:17 +02:00
4356f791a2 XLM passing tests 2019-09-11 11:49:54 +02:00
11ac4b9555 [CI] Symbolic link for documentation 2019-09-11 10:13:44 +02:00
8bdee1cb73 fixed: hard-coded max and min numbers go out of range in fp16, which causes NaN. 2019-09-11 15:41:53 +08:00
7424b2848f Merge pull request #1 from huggingface/master
merge from original repo
2019-09-11 11:02:23 +08:00
364920e216 fix small bug/typo 2019-09-10 21:45:01 +00:00
23c23f5399 Merge pull request #1229 from SKRohit/master
changes in evaluate function in run_lm_finetuning.py
2019-09-10 22:16:45 +02:00
99a54ac51c Merge pull request #1233 from searchivarius/master
Fix to prevent crashing on assert len(tokens_b)>=1
2019-09-10 22:15:47 +02:00
439b37b474 Merge pull request #1241 from mattolson93/patch-1
Fixing typo in gpt2 for doc site's class link
2019-09-10 22:14:18 +02:00
f2cf6ce4a9 Fixing typo in gpt2 for doc site's class link 2019-09-10 09:12:01 -07:00
465870c33f Xlnet working - also added simple question answering model for XLNet 2019-09-10 16:44:41 +02:00
16b6361792 xlnet passing first test 2019-09-10 12:39:27 +02:00
32aabe8c33 WIP XLNet 2019-09-10 12:17:18 +02:00
2c177a87eb Merge pull request #1228 from huggingface/head-masking-test
Trying to fix the head masking test
2019-09-10 11:55:27 +02:00
f851fb55ca fixing error message 2019-09-10 09:24:08 +02:00
eab980fd68 Fix to prevent crashing on assert len(tokens_b)>=1 2019-09-09 19:58:08 -04:00
a95ced6260 [Distillation] save last chkpt as pytorch_model.bin 2019-09-09 19:53:35 +00:00
50c6bc4195 fix tf bert model 2019-09-09 17:46:01 +02:00
4b082bd4d8 Merge pull request #1 from SKRohit/SKRohit-patch-1
changes in return statement of evaluate function
2019-09-09 19:59:27 +05:30
e5df36397b changes in return statement of evaluate function
changed `results` to `result` and removed `results` dict defined previously
2019-09-09 19:55:57 +05:30
0537139b2b removing tf.function 2019-09-09 14:47:31 +02:00
84d346b687 Merge pull request #1195 from huggingface/reorder_arguments
[2.0] Reodering arguments for torch jit #1010 and future TF2.0 compatibility
2019-09-09 15:42:51 +03:00
3f05de6dde Merge branch 'master' into reorder_arguments 2019-09-09 15:42:25 +03:00
33cb00f41a add GPT2 to init - fix weights loading - remove tf.function 2019-09-09 14:29:24 +02:00
78b2a53f10 debug file download in tests error 2019-09-09 13:38:10 +02:00
6b3438df21 fixing GPT2 double head model and updating the torch version tests 2019-09-09 12:48:36 +02:00
e360037236 Merge branch 'tf2' of https://github.com/huggingface/pytorch-transformers into tf2 2019-09-09 11:08:49 +02:00
b7175a2701 fixed imports in tests and gpt2 config test 2019-09-09 11:04:03 +02:00
995e38b7af Merge pull request #1214 from huggingface/new-examples
Better examples
2019-09-09 10:26:36 +03:00
3401980fc4 fix #1208 2019-09-09 10:22:12 +03:00
728637356c WIP GPT2 2019-09-09 10:18:55 +03:00
34f28b2a13 WIP GPT2 2019-09-08 15:02:06 +03:00
ad88563bda WIP GPT-2 2019-09-08 15:02:06 +03:00
64d83c7ae0 WIP 2019-09-08 15:02:06 +03:00
01597e5b90 add tf auto models + tests 2019-09-08 15:02:06 +03:00
f5c698b21a add weights tying, attention and hidden states output tests 2019-09-08 15:02:06 +03:00
6dc4b6f34c skip transfo-xl tokenizer tests with tf for now 2019-09-08 15:02:06 +03:00
e30579f764 no pytest version checking 2019-09-08 15:02:06 +03:00
518307dfcd test suite independent of framework 2019-09-08 15:02:06 +03:00
9d0a11a68c update dependencies and circle-ci 2019-09-08 15:02:06 +03:00
24a20483f5 update conversion script names 2019-09-08 15:02:06 +03:00
6f152572cd add conversion script, rename conversion scripts 2019-09-08 15:02:06 +03:00
a4704b1263 skipping tf tests if tf is not installed 2019-09-08 15:02:06 +03:00
ad0ab9afe9 fix test when tf is not here 2019-09-08 15:02:06 +03:00
59fe641b8b also gathering file names in file_utils 2019-09-08 15:02:06 +03:00
d68a8fe462 add tf bert files 2019-09-08 15:02:06 +03:00
7ae642b72d update conversion scripts 2019-09-08 15:02:06 +03:00
69bff89935 clean ups 2019-09-08 15:02:06 +03:00
1efb1f1660 split configuration and modeling files 2019-09-08 15:02:06 +03:00
1eb125fb95 be sure we have uint8 2019-09-08 15:02:06 +03:00
3f91338be9 Patched a few outdated parameters 2019-09-06 17:48:06 -04:00
f47f9a5874 Updated outdated examples 2019-09-06 17:10:33 -04:00
ee027c89f2 fix #1165 2019-09-06 23:40:05 +03:00
e52737d5ad Updated docs README to feature the examples symlink 2019-09-06 12:13:31 -04:00
5e151f5e77 Table of contents 2019-09-06 12:08:36 -04:00
593c070435 Better examples 2019-09-06 12:00:12 -04:00
5ac8b62265 Merge pull request #1205 from maru0kun/patch-2
Fix typo
2019-09-05 21:44:16 +02:00
5c6cac102b adding test for common properties and cleaning up a bit base class 2019-09-05 21:31:29 +02:00
ed717635ff Merge pull request #1201 from huggingface/configuration_refactoring
[2.0] - Split configuration and modeling files
2019-09-05 21:16:58 +02:00
04b50cabf6 gitignore 2019-09-05 18:49:28 +00:00
dddd6b9927 Update DistilBERT training code 2019-09-05 18:26:14 +00:00
f9453d15e5 Fix broken link 2019-09-05 12:35:22 -04:00
f7ee2e5d20 [README] link to Write With Transformer 2019-09-05 12:33:46 -04:00
d737947725 Fix typo 2019-09-05 19:24:57 +09:00
705237b4ec add tf auto models + tests 2019-09-05 12:21:08 +02:00
600a42329b add weights tying, attention and hidden states output tests 2019-09-05 12:02:14 +02:00
04d2006f28 skip transfo-xl tokenizer tests with tf for now 2019-09-05 11:22:13 +02:00
7f6a0c0d69 no pytest version checking 2019-09-05 11:20:56 +02:00
7c0baf9521 test suite independent of framework 2019-09-05 11:18:55 +02:00
7775a3d2ed update dependencies and circle-ci 2019-09-05 10:23:04 +02:00
33dd59e971 update conversion script names 2019-09-05 03:13:26 +02:00
5951d86024 add conversion script, rename conversion scripts 2019-09-05 03:10:11 +02:00
aa4c8804f2 skipping tf tests if tf is not installed 2019-09-05 03:06:09 +02:00
134847db81 fix test when tf is not here 2019-09-05 02:53:52 +02:00
981f7f5253 Merge branch 'tf2' of https://github.com/huggingface/pytorch-transformers into tf2 2019-09-05 02:34:52 +02:00
bffd17a43d add tf bert files 2019-09-05 02:34:44 +02:00
85df4f7cca also gathering file names in file_utils 2019-09-05 02:34:09 +02:00
11fae9e636 add tf bert files 2019-09-05 02:27:39 +02:00
121f88cae3 update conversion scripts 2019-09-05 02:17:50 +02:00
d77abd4d08 clean ups 2019-09-05 00:41:24 +02:00
2a667b1eb9 split configuration and modeling files 2019-09-05 00:27:11 +02:00
0be6a2a624 be sure we have uint8 2019-09-04 22:47:38 +02:00
7fba47b7d9 WIP reordering 2019-09-04 22:39:23 +02:00
e25cba78cf WIP reodering arguments for torchscript and TF 2019-09-04 22:39:23 +02:00
38b79b5a63 Fixing this TransformerXL bool issue 2019-09-04 22:36:30 +02:00
0b52642d37 1.2.0 in docs 2019-09-04 11:03:32 -04:00
89fd3450a6 Release: 1.2.0 2019-09-04 13:32:18 +02:00
9fd6e7ab9f Merge pull request #1190 from shijie-wu/xlm-tokenization
Fix reference of import in XLM tokenization
2019-09-04 12:50:49 +02:00
a15562e170 Fix reference of import when called for the second time 2019-09-03 18:27:29 -07:00
0287d264e9 Merge pull request #1162 from huggingface/xlnet-bias
XLNet bias fix on resize embeddings (cf #1124)
2019-09-02 23:14:04 +02:00
7f522437bc Updated documentation for LM finetuning script 2019-09-02 13:40:25 -04:00
3fbf301bba [CI] Updated resource size for python 3 tests 2019-09-02 12:35:14 -04:00
2dcc5a1629 [doc] Add blurb about large-scale model downloads
cc @n1t0 @lysandrejik @thomwolf
2019-09-02 12:27:11 -04:00
7b0c99add9 Merge pull request #1174 from huggingface/fix_byte_level_added_tokens
Fix byte-level BPE decoding error when using added tokens
2019-09-02 09:01:16 +02:00
31d3373bc9 Appends space before special token 2019-09-01 21:07:00 -04:00
fede4ef45d fixing #1133 2019-09-02 02:27:39 +02:00
b6cd856b08 Merge pull request #1164 from stefan-it/master
distillation: fix ModuleNotFoundError error in token counts script
2019-09-02 02:00:07 +02:00
ff7368eb6b Merge pull request #1077 from huggingface/pruning-save-and-load
Pruning changes so that deleted heads are kept on save/load
2019-09-01 09:42:15 +02:00
6ae0bb5291 XLM 100 different URLs 2019-08-31 14:46:31 -04:00
819b468f70 Fixed XLM model url 2019-08-31 14:40:51 -04:00
58b59a0c31 Random seed is accessible anywhere within the common tests 2019-08-31 13:17:08 -04:00
a1c34bd286 distillation: fix ModuleNotFoundError error in token counts script 2019-08-31 12:21:38 +02:00
ea86bef545 Check for None 2019-08-31 00:56:22 -04:00
e0f867a9ba XLNet bias fix on resize embeddings (cf #1124) 2019-08-31 00:50:59 -04:00
11600edc6e Rebase on master + DistilBERT head pruning patch 2019-08-31 00:37:41 -04:00
b6992b7b47 Applied patch to OpenAI GPT, RoBERTa, TransfoL, XLM and XLNet 2019-08-31 00:33:50 -04:00
bdb4409ed8 updated pruning logic with sets - Bert and GPT-2 2019-08-31 00:33:50 -04:00
0c8e823b03 Added patch to remaining models 2019-08-31 00:33:50 -04:00
0cd283522a Attempt to fix head index 2019-08-31 00:33:50 -04:00
c85b5db61a Conditional append/init + fixed warning 2019-08-31 00:33:50 -04:00
5c2b94c82a Changed string so that Circle CI accepts the warning 2019-08-31 00:33:50 -04:00
87747518e9 Blocks deletion from already deleted heads. Necessary integration test.
Now raises a warning when a head to be deleted already has been deleted. An integration test verifying the total pipeline (-> from config -> save model -> load model -> additional head pruning) has been added.
2019-08-31 00:33:50 -04:00
719cb3738d Pruning for GPT and GPT-2 2019-08-31 00:33:50 -04:00
fc1fbae45d XLM can be pruned 2019-08-31 00:33:50 -04:00
42e00cf9e1 Pruning saved to configuration first try 2019-08-31 00:33:50 -04:00
d7a4c3252e Fixed filename 2019-08-31 00:08:56 -04:00
7f006cdd87 Set seed for head_masking test 2019-08-30 23:58:49 -04:00
0fd0b674e6 [ci] legible output [skip ci] 2019-08-30 20:36:26 -04:00
b65a994f59 [ci] decrease parallelism to increase success prob 2019-08-30 20:33:16 -04:00
1d438f15b3 [XLNet] Use pytorch's layernorm like in BERT
See #1089

cc @thomwolf @lysandrejik

Also @dhpollack
2019-08-30 20:20:15 -04:00
574c5b3a72 [RoBERTa] LayerNorm's eps is not a nn.Parameter so there's no point setting it on the model
Instead we correctly store it on the config

(regenerating the hosted config files)

cc @lysandrejik
2019-08-30 20:09:24 -04:00
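A hedged sketch of the point above (toy config class, illustrative attribute names): `eps` is a plain constructor argument of `nn.LayerNorm`, not a learnable `nn.Parameter`, so the natural place to keep it is the configuration, and the module is simply built from that value.

```python
# Sketch only: LayerNorm eps read from the config rather than set on the model.
import torch.nn as nn

class TinyConfig:
    hidden_size = 768
    layer_norm_eps = 1e-5   # value stored in the (hosted) config file

config = TinyConfig()
layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
print(layer_norm.eps)       # 1e-05 — a plain attribute, not in layer_norm.parameters()
```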
09363f2a8b Fix documentation index 2019-08-30 19:48:32 -04:00
51e980ce36 Merge pull request #1155 from anhnt170489/apex_fp16
Update apex fp16 implementation
2019-08-30 23:29:11 +02:00
206c35e9a4 Merge pull request #1154 from ziliwang/master
fix: hard coding for max number
2019-08-30 23:23:08 +02:00
f3d18c71ec Merge pull request #1152 from epwalsh/fix-special-tokens
fix adding special tokens
2019-08-30 23:21:59 +02:00
d483cd8e46 Merge pull request #1074 from huggingface/improved_testing
Shortcut to special tokens' ids - fix GPT2 & RoBERTa tokenizers - improved testing for GPT/GPT-2
2019-08-30 23:18:58 +02:00
d2f21f08f5 Merge pull request #1092 from shijie-wu/xlm-tokenization
Added cleaned configuration properties for tokenizer with serialization - improve tokenization of XLM
2019-08-30 23:15:40 +02:00
12b9cc9e26 Merge pull request #1110 from huggingface/automodels
Torch.hub now based on AutoModels - Updating AutoModels with AutoModelWithLMHead, Sequence Classification and Question Answering
2019-08-30 23:08:57 +02:00
bfe93a5a21 fix distilbert in auto tokenizer 2019-08-30 22:43:26 +02:00
256086bc69 clean up and simplify hubconf 2019-08-30 22:34:23 +02:00
80aa87d9a3 fix distilbert tokenizer 2019-08-30 22:24:23 +02:00
455a4c842c add distilbert tokenizer 2019-08-30 22:20:51 +02:00
7a1f174a9d update names of torch.hub to simpler names - update docstring 2019-08-30 22:20:44 +02:00
c665e0fcfe Merge branch 'automodels' of https://github.com/huggingface/pytorch-transformers into automodels 2019-08-30 21:53:36 +02:00
9b6e3b34d9 Docstrings 2019-08-30 14:09:02 -04:00
dec8f4d6fd Added DistilBERT models to all other AutoModels. 2019-08-30 13:52:18 -04:00
bc29aa67a9 HubConf configuration 2019-08-30 12:48:55 -04:00
f35f612280 updating docstring for AutoModel 2019-08-30 12:48:55 -04:00
7ca9653852 Pytorch Hub & AutoModels 2019-08-30 12:48:55 -04:00
25e8389439 Tests for added AutoModels 2019-08-30 12:48:55 -04:00
dc43215c01 Added multiple AutoModel classes: AutoModelWithLMHead, AutoModelForQuestionAnswering and AutoModelForSequenceClassification 2019-08-30 12:48:55 -04:00
282c276e09 typos + file name coherence in distillation README 2019-08-30 12:02:29 -04:00
803c1cc4ea fix relative import bug cf Issue #1140 2019-08-30 12:01:27 -04:00
7044ed6b05 fix tokenizers serialization 2019-08-30 17:36:11 +02:00
cd65c41a83 Merge branch 'master' into xlm-tokenization 2019-08-30 17:15:16 +02:00
69da972ace added test and debug tokenizer configuration serialization 2019-08-30 17:09:36 +02:00
88111de07c saving and reloading tokenizer configurations 2019-08-30 16:55:48 +02:00
b66e9b4433 Merge pull request #1158 from rabeehk/master
regarding #1026 pull request
2019-08-30 16:30:33 +02:00
0a2fecdf90 Merge branch 'master' into master 2019-08-30 16:30:08 +02:00
3871b8a107 adding xlm 17 and 100 models and config on aws 2019-08-30 16:28:42 +02:00
8678ff8df5 adding 17 and 100 xlm models 2019-08-30 16:26:04 +02:00
e0caab0cf0 fix link 2019-08-30 10:09:17 -04:00
a600b30cc3 Fix index number in documentation 2019-08-30 10:08:14 -04:00
20c06fa37d Added DistilBERT to documentation index 2019-08-30 10:06:51 -04:00
39eb31e11e remove reloading tokenizer in the training, adding it to the evaluation part 2019-08-30 15:44:41 +02:00
350bb6bffa updated tokenizer loading for addressing reproducibility issues 2019-08-30 15:34:28 +02:00
82462c5cba Added option to set up pretrained tokenizer arguments 2019-08-30 15:30:41 +02:00
41f35d0b3d Merge pull request #1089 from dhpollack/dhp/use_pytorch_layernorm
change layernorm code to pytorch's native layer norm
2019-08-30 14:49:08 +02:00
01ad55f8cf Merge pull request #1026 from rabeehk/master
loads the tokenizer for each checkpoint, to solve the reproducibility…
2019-08-30 14:15:36 +02:00
50e615f43d Merge branch 'master' into improved_testing 2019-08-30 13:40:35 +02:00
f8aace6bcd update tokenizers to use self.XX_token_id instead of converting self.XX_token 2019-08-30 13:39:52 +02:00
8faf2e086b more doc on special tokens 2019-08-30 13:36:22 +02:00
f7978490b2 Merge pull request #1148 from huggingface/circleci
Documentation auto-deploy
2019-08-30 13:28:16 +02:00
ce5ef4b35d python2 doesn't spark joy 2019-08-30 13:22:43 +02:00
5dd7b677ad clean up all byte-level bpe tests 2019-08-30 12:43:08 +02:00
ca1a00a302 fix for python2 2019-08-30 12:29:31 +02:00
4e6a3172ce update roberta docstring as well 2019-08-30 12:23:37 +02:00
fd10d79b55 update GPT2 docstring 2019-08-30 12:23:12 +02:00
abe734ca1f fix GPT-2 and RoBERTa tests to be clean now 2019-08-30 12:20:18 +02:00
0f5a799456 fix GPT2DoubleHeadModel docstring 2019-08-30 11:49:23 +02:00
d51f72d5de adding shortcut to the ids of all the special tokens 2019-08-30 11:41:11 +02:00
306af132d7 update readme to mention add_special_tokens more clearly in example 2019-08-30 11:30:51 +02:00
50e6daf83a fix Roberta tokenizer __init__ 2019-08-30 11:27:43 +02:00
0517e7a1cb Fix GPT2 and RoBERTa tokenizer to beging with a space - update Roberta tokenizer 2019-08-30 11:23:49 +02:00
6e1ac34e2b Merge remote-tracking branch 'huggingface/master' 2019-08-30 15:50:11 +08:00
2fb9a934b4 re-format 2019-08-30 14:05:28 +09:00
c8731b9583 update apex fp16 implementation 2019-08-30 13:54:00 +09:00
6060b2f89b fix: hard coding for max number
fp16 max number is 65504; the original 1e30 will cause NaN in fp16
2019-08-30 12:13:47 +08:00
07e21307b6 fix adding special tokens 2019-08-29 13:44:50 -07:00
caf1d116a6 Closing bracket in DistilBERT's token count. 2019-08-29 15:30:10 -04:00
e7fba4bef5 Documentation auto-deploy 2019-08-29 12:14:29 -04:00
fe8fb10b44 Small modification of comment in the run_glue.py example
Add RoBERTa to the comment as it was not explicit that RoBERTa doesn't use token_type_ids.
2019-08-29 14:43:30 +02:00
2a2832ce73 Merge pull request #1 from erenup/run_multiple_choice
roberta, xlnet for multiple choice
2019-08-29 16:27:44 +08:00
942d3f4b20 modify code of arc label insurance 2019-08-29 10:21:17 +08:00
bf3dc778b8 Changed learning rate for run_squad test 2019-08-28 18:24:43 -04:00
0a74c88ac6 fix #1131 2019-08-28 22:41:42 +02:00
5f297c7be3 Merge pull request #1087 from huggingface/fix-warnings
Decode now calls private property instead of public method
2019-08-28 22:22:11 +02:00
d9847678b3 Merge pull request #1136 from adai183/update_SQuAD_script
swap order of optimizer.step() and scheduler.step()
2019-08-28 22:00:52 +02:00
0f8ad89206 Merge pull request #1135 from stefan-it/master
distilbert: fix number of hidden_size
2019-08-28 22:00:12 +02:00
9ce42dc540 Pretrained models table fix 2019-08-28 13:56:28 -04:00
1d15a7f278 swap order of optimizer.step() and scheduler.step() 2019-08-28 19:18:27 +02:00
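A minimal sketch of the loop ordering this commit fixes, assuming a standard PyTorch training loop (the model, optimizer and scheduler below are placeholders): since PyTorch 1.1 the learning-rate scheduler should be stepped after the optimizer, otherwise the first scheduled value is skipped and a warning is raised.

```python
# Sketch only: optimizer.step() before scheduler.step().
import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: max(0.0, 1 - step / 100))

for step in range(3):
    loss = model(torch.randn(8, 4)).pow(2).mean()
    loss.backward()
    optimizer.step()      # 1) update the parameters
    scheduler.step()      # 2) then update the learning rate
    optimizer.zero_grad()
```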
ed2ab1c220 distilbert: fix number of hidden_size 2019-08-28 18:08:16 +02:00
0ecfd17f49 Merge pull request #987 from huggingface/generative-finetuning
Generative finetuning
2019-08-28 16:51:50 +02:00
50792dbdcc Merge pull request #1127 from huggingface/dilbert
DilBERT
2019-08-28 16:43:09 +02:00
e7706f514b update again 2019-08-28 16:37:22 +02:00
b5eb283aaa update credits 2019-08-28 16:36:55 +02:00
f753d4e32b Removed typings for Python 2 2019-08-28 10:15:02 -04:00
75bc2a03cc Updated article link 2019-08-28 10:05:15 -04:00
1dc43e56c9 Documentation additions 2019-08-28 09:37:27 -04:00
912a377e90 dilbert -> distilbert 2019-08-28 13:59:42 +02:00
c9bce1811c fixing model to add torchscript, embedding resizing, head pruning and masking + tests 2019-08-28 13:22:45 +02:00
62df4ba59a add dilbert tokenizer and tests 2019-08-28 12:22:56 +02:00
4ce5f36f78 update readmes 2019-08-28 12:14:31 +02:00
ec4b1c659f logging truth error 2019-08-28 16:50:40 +08:00
df52abe373 add sep_token between question and choice 2019-08-28 16:36:21 +08:00
43c243254a avoid invalid labels of truth 2019-08-28 16:03:17 +08:00
3c7e676f8b add test related code: test the best dev acc model when model is training 2019-08-28 15:57:29 +08:00
a5fe16687b fix typo 2019-08-28 07:22:54 +00:00
497f73c964 add DilBERT to master README 2019-08-28 07:16:30 +00:00
93e82ab424 Write README for DilBERT 2019-08-28 06:26:09 +00:00
19b7c9b0b7 add DilBert model for squad 2019-08-28 06:25:44 +00:00
fea921d382 add licensing 2019-08-28 04:45:39 +00:00
da1e4e53fc some fixes in train.py for loading previous checkpoint 2019-08-28 04:01:03 +00:00
0d8f8848d5 add scripts/extract_for_distil.py 2019-08-28 04:00:19 +00:00
7f2c384c80 add scripts/token_counts.py 2019-08-28 04:00:03 +00:00
4d16b279e5 add scripts/binarized_data.py 2019-08-28 03:59:48 +00:00
c513415b19 Dilbert tests from CommonTests 2019-08-27 23:59:00 -04:00
778a263f09 GilBert added to AutoModels 2019-08-27 23:14:00 -04:00
74d78beeb4 fix: add qa_dropout and seq_classif_dropout 2019-08-28 03:13:11 +00:00
7f5d85347e fix small typo 2019-08-28 02:44:51 +00:00
906581ae3c add s3 links for dilbert (+fix small typo) 2019-08-28 02:43:33 +00:00
b247b0d880 add train.py for distillation 2019-08-28 02:12:47 +00:00
780f183e55 add requirements 2019-08-28 01:39:52 +00:00
e424d2e45d add README 2019-08-28 01:10:10 +00:00
1ae81e4aa1 add dataset. distiller, utils 2019-08-28 01:10:05 +00:00
5d29f8e99b fix bugs 2019-08-28 00:57:16 +00:00
a8ad83040d fix bugs 2019-08-28 00:45:33 +00:00
ca4baf8ca1 Match order of casing in OSS XLM; Improve document; Clean up dependency 2019-08-27 20:03:18 -04:00
60c984da6c fix bugs 2019-08-27 22:25:55 +00:00
42968138c8 wip wouf 2019-08-27 22:00:38 +00:00
1d23240068 wip 2019-08-27 14:27:47 +00:00
d06c5a2a0a Merge pull request #1120 from CrafterKolyan/patch-3
Change attention mask dtype to be bool. Fix #1119
2019-08-27 15:01:01 +02:00
edc5222fc3 Merge pull request #1118 from CrafterKolyan/patch-2
Documentation fix #1117
2019-08-27 14:58:50 +02:00
9cf298dfc1 Merge pull request #1116 from CrafterKolyan/patch-1
Delete nonexistent parameter from documentation fix #1115
2019-08-27 14:56:43 +02:00
0d288727b8 fix #1106 2019-08-27 14:50:22 +02:00
447afe9cdf updating docstring for AutoModel 2019-08-27 14:42:03 +02:00
a175a9dc01 add kwargs to base encode function 2019-08-27 14:05:59 +02:00
53282b5bd0 Change attention mask dtype to be bool. Fix #1119 2019-08-27 14:19:03 +03:00
26bda77225 Fix documentation #1117
Rename parameter in documentation + Delete its second occurrence.
2019-08-27 12:22:42 +03:00
c8933bb2d9 Delete nonexistent parameter from documentation
Changed documentation of GPT2Model, GPT2LMHeadModel and GPT2DoubleHeadsModel
2019-08-27 12:10:36 +03:00
e08c01aa1a fix #1102 2019-08-26 18:13:06 -04:00
84a3a9689d Pytorch Hub & AutoModels 2019-08-26 16:08:43 -04:00
f68339639a Tests for added AutoModels 2019-08-26 16:02:23 -04:00
cb60ce59dd Added multiple AutoModel classes: AutoModelWithLMHead, AutoModelForQuestionAnswering and AutoModelForSequenceClassification 2019-08-26 15:44:30 -04:00
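Illustrative usage of the Auto classes introduced here; the checkpoint name is just an example and, at this point in the history, the package is still named `pytorch_transformers` (the snippet downloads weights on first use):

```python
from pytorch_transformers import AutoTokenizer, AutoModelForSequenceClassification

# The concrete architecture (BERT here) is inferred from the checkpoint name/config.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# Note: the sequence-classification head is newly initialized and needs fine-tuning.
```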
529a16dec6 Generic encoding implementation. 2019-08-26 15:00:43 -04:00
f1b018740c Add use_lang_emb to config 2019-08-23 20:33:01 -04:00
e85123d398 Add custom tokenizer for zh and ja 2019-08-23 20:27:52 -04:00
06510ccb53 typo 2019-08-23 22:08:10 +02:00
3bcbebd440 max_len_single_sentence & max_len_sentences_pair as attributes so they can be modified 2019-08-23 22:07:26 +02:00
436ce07218 Tokenization behaves the same as the original XLM preprocessing for most languages except zh, ja and th; change API to allow specifying the language in tokenize 2019-08-23 14:40:17 -04:00
ab7bd5ef98 fixing tokenization and training 2019-08-23 17:31:21 +02:00
47d6853439 adding max_lengths for single sentences and sentences pairs 2019-08-23 17:31:11 +02:00
df9d6effae Merge pull request #1081 from huggingface/fix_distributed_barrier_hang
Fix distributed barrier hang
2019-08-23 16:53:53 +02:00
3f20dd7186 Merge pull request #1075 from abhishekraok/modeling_utils_config_None
reraise EnvironmentError in modeling_utils.py
2019-08-23 12:42:39 +02:00
e13465fb8b change layernorm code to pytorch's native layer norm 2019-08-23 12:12:12 +02:00
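A minimal sketch of what switching to PyTorch's native layer norm looks like, assuming a BERT-style hidden size and epsilon (the values are illustrative):

```python
import torch.nn as nn

hidden_size = 768
layer_norm_eps = 1e-12

# Previously a hand-written LayerNorm module; nn.LayerNorm is the fused built-in equivalent.
layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
```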
c603d099aa reraise EnvironmentError in from_pretrained functions of Model and Tokenizer 2019-08-22 15:25:40 -07:00
2ba1a14fb0 Decode now calls private property instead of public method 2019-08-22 17:25:55 -04:00
90dcd8c05d Merge branch 'master' into generative-finetuning 2019-08-22 10:43:30 +02:00
57272d5ddf fix for glue 2019-08-22 00:25:49 -04:00
b006a7a12f fix for squad 2019-08-22 00:25:42 -04:00
14eef67eb2 Fix at config rather than model 2019-08-21 15:48:43 -07:00
296df2b18c reraise exception 2019-08-21 15:29:30 -07:00
55f69a11b6 OpenAI GPT tests now extend CommonTests 2019-08-21 18:09:25 -04:00
47267ba556 OpenAI GPT-2 now depends on CommonTests. 2019-08-21 17:50:16 -04:00
034aa0c2d7 Fixed GPT2DoubleHeadsModel example and weight tying 2019-08-21 17:27:38 -04:00
e00b4ff1de fix #1017 2019-08-21 22:22:17 +02:00
814a3f4e01 Removed attention_mask from GPT-2 and GPT documentation. Corrected multiple_choice_labels to actual name mc_labels 2019-08-21 14:11:14 -04:00
2f9397139d Added GPT-2 LARGE to Pre-trained Models documentation 2019-08-21 11:29:37 -04:00
d6bbcbc4cf Added finetuning example to documentation 2019-08-21 11:22:05 -04:00
6f877d9daf Update dev results on GLUE (bert-base-uncased) w/ median on 5 runs 2019-08-21 03:43:29 +00:00
07681b6b58 Merge pull request #1064 from huggingface/gpt-2-large
Adding gpt-2 large (774M parameters) model
2019-08-21 03:05:56 +02:00
fdc487d8b3 Add max length 2019-08-21 02:35:01 +02:00
aa05dc8935 adding gpt-2 large 2019-08-21 02:29:34 +02:00
e4515faf54 Merge pull request #1057 from huggingface/fixes
Add a few of typos corrections, bugs fixes and small improvements
2019-08-21 01:54:05 +02:00
41789c6c3d Merge pull request #1059 from GuillemGSubies/master
Better use of spacy tokenizer in open ai and xlm tokenizers
2019-08-21 01:53:48 +02:00
260c86082d Merge pull request #1027 from samvelyan/iterative_split_on_token
Re-implemented tokenize() iteratively in PreTrainedTokenizer.
2019-08-21 01:46:03 +02:00
d30cbaf5dc Merge branch 'master' into iterative_split_on_token 2019-08-21 01:33:02 +02:00
9beaa85b07 Merge pull request #1055 from qipeng/run_squad_fix
Fix #1015 (tokenizer defaults to use_lower_case=True when loading from trained models)
2019-08-21 01:20:46 +02:00
e753f249e1 Merge pull request #806 from wschin/fix-a-path
Fix a path so that a test can run on Windows
2019-08-21 01:14:40 +02:00
2d042274ac Sequence special token handling for BERT and RoBERTa 2019-08-20 14:15:28 -04:00
3bffd2e8e5 more fixes 2019-08-20 10:59:28 -07:00
c3619f5536 Merge pull request #1060 from CrafterKolyan/patch-1
Fix typo. configuratoin -> configuration
2019-08-20 17:39:06 +02:00
3b56427a1e Merge pull request #1040 from FeiWang96/multi_gpu
Fix bug of multi-gpu training in lm finetuning
2019-08-20 17:13:44 +02:00
43489756ad adding proxies options for the from_pretrained methods 2019-08-20 16:59:11 +02:00
a690edab17 various fix and clean up on run_lm_finetuning 2019-08-20 15:52:12 +02:00
ad6e62cd82 Fix typo. configuratoin -> configuration 2019-08-20 15:43:06 +03:00
388e3251fa Update tokenization_xlm.py 2019-08-20 14:19:39 +02:00
f5e2ed0fd8 Update tokenization_openai.py 2019-08-20 14:19:25 +02:00
562b998366 Update tokenization_openai.py 2019-08-20 14:10:19 +02:00
bb04446285 Update tokenization_openai.py 2019-08-20 14:07:40 +02:00
bfd75056b0 Update tokenization_xlm.py 2019-08-20 14:06:17 +02:00
fc74132598 add best steps to train 2019-08-20 19:06:41 +08:00
933841d903 Merge pull request #1056 from Morizeyao/master
Swap of optimizer.step and scheduler.step for lm finetuning examples
2019-08-20 12:42:24 +02:00
6d0aa73981 fix #1034 2019-08-20 12:20:21 +02:00
b0b9b8091b minor typo 2019-08-20 11:33:46 +02:00
53c8f700f4 fix #808 2019-08-20 11:29:26 +02:00
901dde0e45 fix #1014 2019-08-20 11:05:51 +02:00
e239a4a20f close #984 2019-08-20 11:02:00 +02:00
fecaed0ed4 add force_download option to from_pretrained methods 2019-08-20 10:56:12 +02:00
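Hedged usage sketch for the two download-related keyword arguments added around here (`force_download` just above and the `proxies` option added the same day); the model name and proxy address are placeholders:

```python
from pytorch_transformers import BertModel

model = BertModel.from_pretrained(
    "bert-base-uncased",
    force_download=True,                           # ignore any cached copy and re-download
    proxies={"https": "http://10.10.1.10:1080"},   # placeholder proxy configuration
)
```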
d86b49ac86 swap optimizer.step and scheduler.step 2019-08-20 16:46:34 +08:00
45ab8bf60e Revert "Update finetune_on_pregenerated.py"
This reverts commit a1359b970cb4bfa41008a45b44dd2a25e579bff3.
2019-08-20 16:40:39 +08:00
97c30b73d5 add test related code 2019-08-20 16:31:04 +08:00
d5e60e5b7a add test related code 2019-08-20 16:25:50 +08:00
a1359b970c Update finetune_on_pregenerated.py 2019-08-20 16:00:07 +08:00
28f7ca1f80 swap optimizer.step and scheduler.step 2019-08-20 15:58:42 +08:00
a368b87791 Fix #1015 2019-08-19 13:07:00 -07:00
f94f1c6016 Distributed training + tokenizer agnostic mask token 2019-08-19 14:58:50 -04:00
c589862b78 Doc: loading from config alone does not load the model weights 2019-08-19 10:17:47 -04:00
5a49b793d9 Merge pull request #1023 from tuvuumass/patch-1
fix issue #824
2019-08-19 15:31:46 +02:00
4270d3da1b fix an evaluation bug 2019-08-19 16:38:52 +08:00
b8fde43868 a coding bug 2019-08-19 16:36:43 +08:00
40acf6b52a don't save model without training 2019-08-18 05:02:25 -04:00
47e9aea0fe add args info to evaluate_result.txt 2019-08-18 17:00:53 +08:00
5582bc4b23 add multiple choice to roberta and xlnet, test on swag, roberta=0.82.28, xlnet=0.80
2019-08-18 16:01:48 +08:00
856a63da4d Fix: save model/model.module 2019-08-18 11:03:47 +08:00
1ef41b8337 Revert "Fix: save model/model.module"
This reverts commit 00e9c4cc9616cab1666cab0a331b5d7e68946928.
2019-08-18 11:03:12 +08:00
00e9c4cc96 Fix: save model/model.module 2019-08-18 11:02:02 +08:00
189ff9b664 Update README after RoBERTa addition 2019-08-17 13:18:37 -04:00
e384ae2b9d Merge remote-tracking branch 'huggingface/master'
merge huggingface/master to update
2019-08-17 12:05:57 +08:00
d8923270e6 Correct truncation for RoBERTa in 2-input GLUE 2019-08-16 16:30:38 -04:00
5652f54ac2 Simplified data generator + better perplexity calculator
GPT-2 now obtains ~20 perplexity on WikiText-2
2019-08-16 13:49:56 -04:00
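For reference on the perplexity figure above: perplexity is the exponential of the average per-token cross-entropy loss over the evaluation set. A self-contained sketch (the numbers are made up for illustration):

```python
import math

def perplexity(total_loss: float, num_tokens: int) -> float:
    """Perplexity = exp(mean negative log-likelihood per token)."""
    return math.exp(total_loss / num_tokens)

# An average loss of ~3.0 nats per token corresponds to a perplexity of ~20.
print(perplexity(total_loss=3000.0, num_tokens=1000))  # ≈ 20.09
```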
7e7fc53da5 Fixing run_glue example with RoBERTa 2019-08-16 11:53:10 -04:00
715534800a BERT + RoBERTa masking tokens handling + GPU device update. 2019-08-16 10:10:21 -04:00
339e556feb CLM for BERT, beginning of CLM fot RoBERTa; still needs a better masking token mechanism. 2019-08-16 10:10:20 -04:00
5c18825a18 Removed dataset limit 2019-08-16 10:10:20 -04:00
3e3e145497 Added GPT to the generative fine-tuning. 2019-08-16 10:10:20 -04:00
47975ed53e Language Modeling fine-tuning using GPT-2. 2019-08-16 10:10:20 -04:00
ab05280666 Order of strings in AutoModel/AutoTokenizer updated. 2019-08-16 09:53:26 -04:00
b8ff56896c Fix bug of multi-gpu training in lm finetuning 2019-08-16 12:11:05 +08:00
9d0029e215 Added RoBERTa example to README 2019-08-15 17:17:35 -04:00
83dba0b67b Added RoBERTa tokenizer to AutoTokenizer 2019-08-15 17:07:07 -04:00
e24e19ce3b Added RoBERTa to AutoModel/AutoConfig 2019-08-15 14:02:11 -04:00
fe02e45e48 Release: 1.1.0 2019-08-15 11:15:08 -04:00
88efc65bac Merge pull request #964 from huggingface/RoBERTa
RoBERTa: model conversion, inference, tests 🔥
2019-08-15 11:11:10 -04:00
8308170156 Warning for RoBERTa sequences encoded without special tokens. 2019-08-15 10:29:04 -04:00
572dcfd1db Doc 2019-08-14 14:56:14 -04:00
c4ef103447 [RoBERTa] First 4 authors
cf. https://github.com/huggingface/pytorch-transformers/pull/964#discussion_r313574354

Co-Authored-By: Myle Ott <myleott@fb.com>
2019-08-14 12:31:09 -04:00
3d47a7f8ab loads the tokenizer for each checkpoint to solve the reproducibility issue 2019-08-14 10:58:26 +02:00
9ce36e3e4b Re-implemented tokenize() iteratively in PreTrainedTokenizer. 2019-08-14 08:57:09 +00:00
39f426be65 Added special tokens <pad> and <mask> to RoBERTa. 2019-08-13 15:19:50 -04:00
baf08ca1d4 [RoBERTa] run_glue: correct pad_token + reorder labels 2019-08-13 12:51:15 -04:00
3d87991f60 Fixed error with encoding 2019-08-13 12:00:24 -04:00
ba4bce2581 fix issue #824 2019-08-13 11:26:27 -04:00
634a3172d8 Added integration tests for sequence builders. 2019-08-12 15:14:15 -04:00
22ac004a7c Added documentation and changed parameters for special_tokens_sentences_pair. 2019-08-12 15:13:53 -04:00
912fdff899 [RoBERTa] Update run_glue for RoBERTa 2019-08-12 13:49:50 -04:00
b3d83d68db Fixup 9d0603148bc34255fad0cad73ce438ecd7306322 2019-08-12 12:28:55 -04:00
a7b4cfe919 Update README.md
I assume it should test the `re-load` functionality after testing the `save` functionality; however, I'm also surprised that nobody has pointed this out after such a long time, so maybe I've misunderstood the purpose. This PR is just in case :)
2019-08-12 09:53:05 -04:00
b219029c45 refactoring old run_swag. This script is mainly refactored from run_squad in pytorch_transformers 2019-08-11 15:20:37 +08:00
aaedfc35a8 Merge branch 'master' of https://github.com/huggingface/pytorch-transformers 2019-08-10 20:04:37 +02:00
c683c3d5a5 fix #993 2019-08-10 20:04:35 +02:00
7060766490 Corrected logger.error info
Signed-off-by: Kevin Trebing <Kevin.Trebing@gmx.net>
2019-08-09 19:36:44 -04:00
75d5f98fd2 Roberta tokenization + fixed tests (py3 + py2). 2019-08-09 15:02:13 -04:00
14e970c271 Tokenization encode/decode class-based sequence handling 2019-08-09 15:01:38 -04:00
3566d27919 Clarified PreTrainedModel.from_pretrained warning messages in documentation. 2019-08-08 19:04:34 -04:00
fbd746bd06 Updated test architecture 2019-08-08 18:21:34 -04:00
6c41a8f5dc Encode and Decode are back in the superclass. They now handle sentence pairs special tokens. 2019-08-08 18:20:32 -04:00
e367ac469c [RoBERTa] Re-apply 39d72bcc7b2c99c04b6f483f0d8e7bdff547d37c
cc @lysandrejik
2019-08-08 11:26:11 -04:00
9d0603148b [RoBERTa] RobertaForSequenceClassification + conversion 2019-08-08 11:24:54 -04:00
f2b300df6b fix #976 2019-08-08 10:38:57 -04:00
7df303f5ad fix #971 2019-08-08 10:36:26 -04:00
d2cc6b101e Merge branch 'master' into RoBERTa 2019-08-08 09:42:05 -04:00
39d72bcc7b Fixed the RoBERTa checkpoint conversion script according to the LM head refactoring. 2019-08-07 14:21:57 -04:00
770043eea2 Sentence-pair tasks handling. Using common tests on RoBERTa. Forced push to fix indentation. 2019-08-07 12:53:19 -04:00
7729ef7381 Merge pull request #955 from FeiWang96/master
Fix comment typo
2019-08-07 10:11:25 +02:00
5c6ecf37e7 Merge pull request #958 from saket404/typo-fix
Fixed small typo
2019-08-07 10:10:20 +02:00
b4f9464f90 Merge pull request #960 from ethanjperez/patch-1
Fixing unused weight_decay argument
2019-08-07 10:09:55 +02:00
822d6768eb Merge pull request #962 from guotong1988/patch-1
Update modeling_xlnet.py
2019-08-07 10:09:20 +02:00
7e6102ce74 Merge pull request #963 from guotong1988/patch-2
Update modeling_bert.py
2019-08-07 10:09:04 +02:00
3773ba44f0 Merge pull request #977 from chrisgzf/master
Fixed typo in migration guide
2019-08-07 10:08:45 +02:00
a80aa03bda Merge pull request #973 from FeiWang96/bert_config
Fix examples of loading pretrained models in docstring
2019-08-07 10:08:22 +02:00
a6f412da01 Fixed typo in migration guide 2019-08-07 02:19:14 +08:00
6ec1ee9ec2 Fix examples in docstring 2019-08-06 11:32:54 +08:00
72622926e5 Fix examples in docstring 2019-08-06 11:32:41 +08:00
f889e77b9c Fix examples of loading pretrained models in docstring 2019-08-06 11:30:35 +08:00
beb03ec6c5 Fix examples of loading pretrained models in docstring 2019-08-06 11:24:46 +08:00
4fc9f9ef54 Merge pull request #910 from huggingface/auto_models
Adding AutoTokenizer and AutoModel classes that automatically detect architecture - Clean up tokenizers
2019-08-05 19:17:47 +02:00
d43dc48b34 Merge branch 'master' into auto_models 2019-08-05 19:17:35 +02:00
0b524b0848 remove derived classes for now 2019-08-05 19:08:19 +02:00
13936a9621 update doc and tests 2019-08-05 18:48:16 +02:00
ed4e542260 adding tests 2019-08-05 18:14:07 +02:00
3a126e73dd fix #950 2019-08-05 17:26:29 +02:00
7223886dc9 fix #944 2019-08-05 17:16:56 +02:00
70c10caa06 add option mentioned in #940 2019-08-05 17:09:37 +02:00
077ad693e9 tweak issue templates wordings 2019-08-05 16:46:29 +02:00
02d4087cb8 Merge branch 'master' of https://github.com/huggingface/pytorch-pretrained-BERT 2019-08-05 16:26:01 +02:00
7c524d631e add issue templates 2019-08-05 16:25:54 +02:00
6f05ad72b4 Merge pull request #791 from huggingface/doc
RestructuredText table for pretrained models.
2019-08-05 10:18:00 -04:00
b90e29d52c working on automodels 2019-08-05 16:06:34 +02:00
58830807d1 indicate we only support pytorch 1.0.0+ now 2019-08-05 14:38:59 +02:00
328afb7097 cleaning up tokenizer tests structure (at last) - last remaining ppb refs 2019-08-05 14:08:56 +02:00
0e918707dc Merge pull request #907 from dhpollack/fix_convert_to_tf
Fix convert to tf
2019-08-05 12:55:04 +02:00
cb9db101c7 Python 2 must DIE 2019-08-04 22:04:15 -04:00
05c083520a [RoBERTa] model conversion, inference, tests 🔥 2019-08-04 21:39:21 -04:00
d7fd10568c Update modeling_bert.py 2019-08-05 08:58:19 +08:00
84eb699082 Update modeling_xlnet.py 2019-08-05 08:57:09 +08:00
00132b7a7a updating docs - adding few tests to tokenizers 2019-08-04 22:42:55 +02:00
28ba345ecc Fixing unused weight_decay argument
Currently the L2 regularization is hard-coded to "0.01", even though there is a --weight_decay flag implemented (that is unused). I'm making this flag control the weight decay used for fine-tuning in this script.
2019-08-04 12:31:46 -04:00
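A sketch of the parameter-grouping pattern the `--weight_decay` flag now controls, following the examples' usual convention of excluding biases and LayerNorm weights; the tiny model and the Namespace stand in for the script's real model and parsed arguments:

```python
import torch
from argparse import Namespace

args = Namespace(weight_decay=0.01)   # stand-in for the parsed --weight_decay flag
model = torch.nn.Linear(4, 2)         # stand-in for the fine-tuned model

no_decay = ["bias", "LayerNorm.weight"]
grouped_parameters = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": args.weight_decay},           # previously hard-coded to 0.01
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = torch.optim.Adam(grouped_parameters, lr=5e-5)
```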
009273dbdd big doc update [WIP] 2019-08-04 12:14:57 +02:00
836e513698 Fixed small typo 2019-08-04 16:05:10 +10:00
a24f830604 Fix comment typo 2019-08-03 12:17:06 +08:00
44dd941efb link to swift-coreml-transformers 2019-08-01 09:50:30 -04:00
f2a3eb987e Fix small typos 2019-07-31 11:05:06 -04:00
97091acb8c Small spelling fix 2019-07-31 10:37:56 -04:00
769bb643ce Fixing a broken link. 2019-07-31 10:22:41 -04:00
c90119e543 spelling mistake 2019-07-29 16:56:02 +02:00
bfbe52ec39 cleaning up example docstrings 2019-07-27 20:25:39 +02:00
4cc1bf81ee typos 2019-07-27 12:08:21 +02:00
ac27548b25 fix unk_token test 2019-07-27 11:50:47 +02:00
c717d38573 dictionnary => dictionary 2019-07-26 23:30:48 +02:00
6b763d04a9 Merge pull request #911 from huggingface/small_fixes
Small fixes
2019-07-26 21:36:21 +02:00
7b6e474c9a fix #901 2019-07-26 21:26:44 +02:00
632d711411 fix #908 2019-07-26 21:14:37 +02:00
c054b5ee64 Merge pull request #896 from zijunsun/master
fix multi-gpu training bug when using fp16
2019-07-26 19:31:02 +02:00
27b0f86d36 clean up pretrained 2019-07-26 17:09:21 +02:00
57e54ec070 add unk_token to gpt2 2019-07-26 17:09:07 +02:00
ac42049c08 add auto models and auto tokenizer 2019-07-26 17:08:59 +02:00
09ecf225e9 fixed the fix. tf session madness. 2019-07-26 15:20:44 +02:00
edfd965ac8 fix convert_to_tf 2019-07-26 14:13:46 +02:00
f0aeb7a814 multi-gpu training should also come after apex fp16 (squad) 2019-07-26 15:23:29 +08:00
46cc9dd2b5 Merge pull request #899 from sukuya/master
Fixed import to use torchscript flag.
2019-07-25 15:03:21 +02:00
6219ad7216 Merge pull request #888 from rococode/patch-1
Update docs for parameter rename
2019-07-25 15:01:22 +02:00
0b6122e96a Merge pull request #882 from Liangtaiwan/squad_v1_bug
fix squad v1 error (na_prob_file should be None)
2019-07-25 14:59:59 +02:00
c244562cae Merge pull request #893 from joelgrus/patch-2
make save_pretrained do the right thing with added tokens
2019-07-25 14:58:48 +02:00
e1e2ab3482 Merge pull request #1 from sukuya/sukuya-patch-1
Update torchscript.rst
2019-07-25 16:53:11 +08:00
35c52f2f3c Update torchscript.rst
Import fixed to pytorch_transformers, otherwise the torchscript flag can't be used.
2019-07-25 16:51:11 +08:00
adb3ef6368 multi-gpu training should also come after apex fp16 2019-07-25 13:09:10 +08:00
ae152cec09 make save_pretrained work with added tokens
Right now it's dumping the *decoder* when it should be dumping the *encoder*; this fixes that.
2019-07-24 16:54:48 -07:00
66b15f73f0 Update docs for parameter rename
OpenAIGPTLMHeadModel now accepts `labels` instead of `lm_labels`
2019-07-24 11:27:08 -07:00
a7fce6d917 fix squad v1 error (na_prob_file should be None) 2019-07-24 16:11:36 +08:00
067923d326 Merge pull request #873 from huggingface/identity_replacement
Add nn.Identity replacement for old PyTorch
2019-07-23 18:16:35 +02:00
368670ac31 Merge pull request #866 from xanlsh/master
Rework how PreTrainedModel.from_pretrained handles its arguments
2019-07-23 18:05:30 +02:00
1383c7b87a Fix #869 2019-07-23 17:52:20 +02:00
6070b55443 fix #868 2019-07-23 17:46:01 +02:00
2c9a3115b7 fix #858 2019-07-23 16:45:55 +02:00
4fb56c7729 Remove unused *args parameter from PreTrainedConfig.from_pretrained 2019-07-23 10:43:01 -04:00
e179c55490 Add docs for from_pretrained functions, rename return_unused_args 2019-07-23 10:43:01 -04:00
fec76a481d Update readme 2019-07-23 16:05:29 +02:00
859c441776 Merge pull request #872 from huggingface/saving_schedules
Updating schedules for state_dict saving/loading
2019-07-23 16:03:06 +02:00
0740e63e49 updating schedules for state_dict saving 2019-07-23 15:57:18 +02:00
268c6cc160 Merge pull request #845 from rabeehk/master
fixed version issues in run_openai_gpt
2019-07-23 15:29:31 +02:00
1d7d01c080 Merge pull request #847 from lpq29743/master
typos
2019-07-23 15:28:31 +02:00
c4bc66886d Merge pull request #860 from Yiqing-Zhou/patch-1
read().splitlines() -> readlines()
2019-07-23 15:24:25 +02:00
ba52fe69d5 update breaking change section regarding from_pretrained keyword arguments 2019-07-23 15:10:02 +02:00
b1019d2a8e token[-1] -> token.rstrip('\n') 2019-07-23 20:41:26 +08:00
0227b4a940 fix #827 2019-07-23 14:06:43 +02:00
490ebbdcf7 Fix PretrainedModel.from_pretrained not passing cache_dir forward 2019-07-22 18:03:08 -04:00
b8009cb0da Make PreTrainedModel.from_pretrained pass unused arguments to model 2019-07-22 18:03:08 -04:00
bef0c629ca fix
Remove '\n' before adding token into vocab
2019-07-22 22:30:49 +08:00
897d0841be read().splitlines() -> readlines()
splitlines() does not work as we expect here for bert-base-chinese because there is a '\u2028' (unicode line separator) character in the vocab file. The value of '\u2028'.splitlines() is ['', ''].
Perhaps we should use readlines() instead.
2019-07-22 20:49:09 +08:00
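A short, self-contained reproduction of the U+2028 issue described above (not the tokenizer's actual code):

```python
import io

line = "token_with\u2028separator\n"

# str.splitlines() also treats U+2028 as a line boundary, silently splitting one
# vocab entry into two and shifting every subsequent token id:
print(line.splitlines())              # ['token_with', 'separator']

# readlines() only splits on '\n', so the entry survives intact:
print(io.StringIO(line).readlines())  # ['token_with\u2028separator\n']
```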
2f869dc665 Fixed typo 2019-07-21 11:05:36 -04:00
76be189b08 typos 2019-07-21 20:39:42 +08:00
f63ff536ad fixed version issues in run_openai_gpt 2019-07-20 12:43:07 +02:00
a615499076 Merge pull request #797 from yzy5630/fix-examples
fix some errors for distributed lm_finetuning
2019-07-18 23:32:33 +02:00
dbecfcf321 Merge pull request #815 from praateekmahajan/update-readme-link
Update Readme link for Fine Tune/Usage section
2019-07-18 18:30:32 +02:00
acc48a0cc9 typos 2019-07-18 09:54:04 -04:00
a1fe4ba9c9 use new API for save and load 2019-07-18 15:45:23 +08:00
0d46b17553 Update Readme
Incorrect link for `Quick tour: Fine-tuning/usage scripts`
2019-07-17 22:50:10 -07:00
a7ba27b1b4 add parser for adam 2019-07-18 08:52:51 +08:00
c4e9615691 Fix a path so that test can run on Windows 2019-07-17 09:08:40 -07:00
9d381e7be9 Fixed incorrect links in the PretrainedModel 2019-07-17 09:25:38 -04:00
d6522e2873 change loss and optimizer to new API 2019-07-17 21:22:34 +08:00
71d597dad0 fix #800 2019-07-17 13:51:09 +02:00
4bcddf6fc8 Merge pull request #801 from bzantium/master
import sys twice
2019-07-17 12:31:26 +02:00
506ab34d0e Merge pull request #796 from stefan-it/minor-doc-updates
Minor documentation updates
2019-07-17 12:26:34 +02:00
cd8980e1f4 import sys twice 2019-07-17 18:12:01 +09:00
123da5a2fa fix errors for lm_finetuning examples 2019-07-17 09:56:07 +08:00
60a1bdcdac fix some errors for distributed lm_finetuning 2019-07-17 09:16:20 +08:00
e6cc6d237f docs: fix link to various notebooks 2019-07-16 23:42:28 +02:00
5b78400e21 docs: fix link to modeling example source (bert) 2019-07-16 23:41:57 +02:00
61cc3ee350 docs: fix link to tf checkpoint to pytorch script 2019-07-16 23:41:04 +02:00
dbbd94cb7a docs: fix link to bertology example and update dataset description 2019-07-16 23:40:04 +02:00
5fe0b378d8 adding missing docstring fix #793 2019-07-16 21:35:53 +02:00
e848b54730 fix #792 2019-07-16 21:22:19 +02:00
c5b3d86a91 Merge branch 'master' of https://github.com/huggingface/pytorch-pretrained-BERT 2019-07-16 21:21:05 +02:00
6b70760204 typos 2019-07-16 21:21:03 +02:00
117ed92992 RestructuredText table for pretrained models. 2019-07-16 11:58:47 -04:00
b33a385091 update readme 2019-07-16 16:18:37 +02:00
ed7549bb1a release version 1.0 2019-07-16 16:10:58 +02:00
6a72d9aa52 updated examples in readme 2019-07-16 16:09:29 +02:00
b59043bf8f update readme 2019-07-16 16:03:48 +02:00
edc79acb3b simpler quick tour 2019-07-16 16:02:32 +02:00
5c82d3488f indicate default evaluation in breaking changes 2019-07-16 15:45:58 +02:00
4acaa65068 model in evaluation mode by default after from_pretrained 2019-07-16 15:41:57 +02:00
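Illustration of the new default: a model returned by `from_pretrained` is already in evaluation mode (dropout disabled), so fine-tuning code has to switch back explicitly. The checkpoint name is an example and the snippet needs the weights to be downloadable or cached:

```python
from pytorch_transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased")
assert not model.training   # eval mode by default after this change

model.train()               # switch back explicitly before fine-tuning
```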
f289e6cfe4 fix docstrings 2019-07-16 15:31:21 +02:00
9726b229cf model name typo 2019-07-16 15:17:45 +02:00
1849aa7d39 update readme and pretrained model weight files 2019-07-16 15:11:29 +02:00
43e0e8fa04 updates to readme and doc 2019-07-16 13:56:47 +02:00
f31154cb9d Merge branch 'xlnet' 2019-07-16 11:51:13 +02:00
1b35d05d4b update conversion scripts and __main__ 2019-07-16 09:41:55 +02:00
352e3ff998 added migration guide to readme 2019-07-16 09:03:49 +02:00
8ad7e5b4f2 indeed 2019-07-16 00:29:15 +02:00
064d0a0b76 update readme 2019-07-16 00:21:33 +02:00
3b8b0e01bb update readme 2019-07-16 00:12:55 +02:00
76da9765b6 fix run_generation test 2019-07-15 17:52:35 +02:00
e691fc0963 update QA models tests + run_generation 2019-07-15 17:45:24 +02:00
15d8b1266c update tokenizer - update squad example for xlnet 2019-07-15 17:30:42 +02:00
3b469cb422 updating squad for compatibility with XLNet 2019-07-15 15:28:37 +02:00
8ca767f13c clean up optimization 2019-07-15 13:49:07 +02:00
74a24f0fe9 clean up file_utils 2019-07-15 13:49:01 +02:00
ab49fafc04 update tokenization docstrings for #328 2019-07-15 12:51:23 +02:00
a9ab15174c fix #328 2019-07-15 12:42:12 +02:00
f7cd7392fd fixed tests 2019-07-15 12:32:19 +02:00
e28d8bde0d doc on base classes 2019-07-15 12:08:06 +02:00
44c985facd update doc for XLM and XLNet 2019-07-15 11:36:50 +02:00
0201d86015 added doc for transformer-xl 2019-07-15 10:11:09 +02:00
4cb489457f added doc for openai GPT 2019-07-15 09:58:01 +02:00
62b8eb43c1 fix add_start_docstrings on python 2 (removed) 2019-07-15 09:49:02 +02:00
5bc3d0cc5b added gpt2 doc 2019-07-15 09:40:05 +02:00
183fedfed5 fix doc on python2 2019-07-15 09:00:09 +02:00
0e9825e252 small fix to run_glue 2019-07-14 23:43:28 +02:00
2397f958f9 updating examples and doc 2019-07-14 23:20:10 +02:00
c490f5ce87 added generation examples in tests 2019-07-13 15:26:58 +02:00
8bb02c27e2 Merge branch 'xlnet' of https://github.com/huggingface/pytorch-pretrained-BERT into xlnet 2019-07-13 15:25:06 +02:00
7d4b200e40 good quality generation example for GPT, GPT-2, Transfo-XL, XLNet 2019-07-13 15:25:03 +02:00
69dc010936 Merge pull request #786 from huggingface/doc-sphinx
New documentation for pytorch-transformers
2019-07-13 12:08:57 +02:00
7322c314a6 remove python2 testing for examples 2019-07-12 14:24:08 +02:00
936e813c84 clean up examples - added squad example and test 2019-07-12 14:16:06 +02:00
699bc7e86e fix gpt-2 unk token test 2019-07-12 11:46:57 +02:00
762ded9b1c wip examples 2019-07-12 11:28:52 +02:00
7442956361 save config file 2019-07-12 11:26:16 +02:00
292140b921 Merge pull request #781 from huggingface/embeddings
Clean up input embeddings resizing and weights tying
2019-07-12 11:10:25 +02:00
c57e9d946f Merge branch 'xlnet' into embeddings 2019-07-12 11:10:14 +02:00
2918b7d2a0 updating tests 2019-07-12 10:57:58 +02:00
3fbceed8d2 Fix layer reference loss + previous attempted fix 2019-07-11 22:29:55 -04:00
6c2ee16c04 Test suite testing the tie_weights function as well as the resize_token_embeddings function.
Patched an issue relating to the tied weights I had introduced with the TorchScript addition.
Byte order mark management in TSV glue reading.
2019-07-11 22:09:16 -04:00
3821ecbf4a Byte order mark management in TSV glue reading. 2019-07-11 20:16:28 -04:00
e3fb4310d6 From pretrained correct initialization. Unknown token handling for gpt2. 2019-07-11 18:44:29 -04:00
bd404735a7 embeddings resizing + tie_weights 2019-07-12 00:02:49 +02:00
50e62a4cb4 fix gpt/gpt-2 from pretrained 2019-07-11 16:50:21 -04:00
273617b86d update config - fix gpt/gpt-2 from pretrained 2019-07-11 22:45:03 +02:00
6b13f4cb3a update circle-ci 2019-07-11 22:36:35 +02:00
2b644785f0 add tests on examples and large circle ci config 2019-07-11 22:31:50 +02:00
c6bf1a400d fix test examples and pretrained model 2019-07-11 22:29:08 +02:00
92a782b108 fix run_glue test 2019-07-11 22:20:10 +02:00
6491575fd5 Added TorchScript disclaimer. CSS modifications. 2019-07-11 12:38:21 -04:00
ccb6947dc1 optimization tests 2019-07-11 17:39:47 +02:00
e4f9dca018 Merge pull request #773 from huggingface/doc-sphinx
Sphinx doc, XLM Checkpoints
2019-07-11 15:46:39 +02:00
b87eb82b4f Merge branch 'xlnet' into doc-sphinx 2019-07-11 15:46:27 +02:00
d216e798af Merge pull request #777 from huggingface/examples
Working GLUE Example for XLNet (STS-B)
2019-07-11 15:43:47 +02:00
6135de2fa3 readme update 2019-07-11 15:39:49 +02:00
b21d84b027 update examples 2019-07-11 15:37:34 +02:00
ec07cf5a66 revamp optimization 2019-07-11 14:48:22 +02:00
4fef5919a5 updating examples 2019-07-11 12:03:08 +02:00
7fdbc47822 Added the two CLM XLM pretrained checkpoints.
Fixed file extensions for config/vocab/merges of XLM models.
2019-07-10 19:37:24 -04:00
dee3e45b93 Fixed XLM weights conversion script. Added 5 new checkpoints for XLM. 2019-07-10 19:04:21 -04:00
c82b74b996 Fixed Sphinx errors and warnings 2019-07-10 15:30:19 -04:00
5288913bdd All TODOs to be checked by Thom have been added. 2019-07-10 15:16:40 -04:00
f773faa258 Fixed all links. Removed TPU. Changed CLI to Converting TF models. Many minor formatting adjustments. Added "TODO Lysandre filled" where necessary. 2019-07-10 14:45:56 -04:00
50b7e52a7f WIP examples 2019-07-10 15:33:34 +02:00
3f56ad5aff Updated CircleCI's config.yml to use a large resource class. 2019-07-09 18:50:59 -04:00
c4bab2dc85 Added footer with social links. 2019-07-09 18:03:01 -04:00
331db8cc02 Added viewcode plugin for source code visualization within the static website. 2019-07-09 17:01:56 -04:00
83fb311ef7 Patched warnings + Refactored XLNet's Docstrings 2019-07-09 16:38:30 -04:00
8fe2c9d98e Refactored Docstrings of BERT, GPT2, GPT, TransfoXL, XLM and XLNet. 2019-07-09 15:55:31 -04:00
ed6c8d37f4 fix merge 2019-07-09 17:14:52 +02:00
e468192e2f Merge branch 'pytorch-transformers' into xlnet 2019-07-09 17:05:37 +02:00
4ce237c880 update run_glue 2019-07-09 17:00:32 +02:00
9dd2c86033 Merge pull request #767 from huggingface/doc
Documentation
2019-07-09 16:56:34 +02:00
e0e5c7faf5 Added requirements.txt file. 2019-07-09 10:16:09 -04:00
3b7cb7bf44 small update to run_glue 2019-07-09 16:12:15 +02:00
269e73b601 Adding example detailing how to add a new file to the documentation + adding fonts. 2019-07-09 10:11:29 -04:00
d743f2f34e updating test 2019-07-09 15:58:58 +02:00
d0efbd3cd1 update sequencesummary module 2019-07-09 15:46:43 +02:00
d5481cbe1b adding tests to examples - updating summary module - coverage update 2019-07-09 15:29:42 +02:00
c079d7ddff fix python 2 tests 2019-07-09 10:40:59 +02:00
b19786985d unified tokenizer api and serialization + tests 2019-07-09 10:25:18 +02:00
6847e30e1c New page detailing the use of TorchScript. 2019-07-08 17:34:24 -04:00
ab30651802 Hugging Face theme. 2019-07-08 16:05:26 -04:00
a60ae1a505 Docstrings best practice shown in the BERT documentation. 2019-07-08 11:50:32 -04:00
64fd986376 Tokenizers and Config classes are referenced. 2019-07-05 17:44:59 -04:00
df759114c9 Single file documentation for each model, accompanied by the Documentation overview. 2019-07-05 17:35:26 -04:00
03de9686a7 Initial folder structure for the documentation. A draft of documentation change has been made in the BertModel class. 2019-07-05 17:11:13 -04:00
3d5f291386 updates to run_glue 2019-07-05 17:22:15 +02:00
99b90edab1 cleaning up run_glue example 2019-07-05 17:09:35 +02:00
1113f97f33 clean up glue example 2019-07-05 16:31:13 +02:00
162ba383b0 fix model loading 2019-07-05 15:57:14 +02:00
6dacc79d39 fix python2 tests 2019-07-05 15:11:59 +02:00
36bca545ff tokenization abstract class - tests for examples 2019-07-05 15:02:59 +02:00
a4f980547f remove circle ci parallelism 2019-07-05 12:31:34 +02:00
eb91f6437e update readme and setup 2019-07-05 12:30:15 +02:00
78462aad61 Merge pull request #733 from ceremonious/parallel-generation
Added option to use multiple workers to create training data
2019-07-05 12:04:30 +02:00
781124b0d1 Merge pull request #620 from chrislarson1/convert-back-to-tf
Convert pytorch models back to tensorflow
2019-07-05 12:01:17 +02:00
e5fe2bb5e8 Merge pull request #745 from leimao/leimao
fix evaluation bug
2019-07-05 12:00:04 +02:00
0231ba291e circle-ci 2019-07-05 11:59:04 +02:00
0bab55d5d5 [BIG] name change 2019-07-05 11:55:36 +02:00
9113b50c96 hubs [WIP] 2019-07-05 11:31:51 +02:00
175fce0a55 Merge pull request #758 from huggingface/doc
Release 0.7 - Add tokenizer API + tests
2019-07-05 11:22:03 +02:00
e75c3f70aa standardizing tokenizers API and adding tests 2019-07-05 11:20:27 +02:00
c0239e09e6 first commit 2019-07-04 17:06:30 +02:00
cf86d23eff parallelism in circlci 2019-07-04 17:02:21 +02:00
15b70338ba adding squad model to xlnet and xlm 2019-07-04 16:50:42 +02:00
fbe04423b6 Common SequenceSummary class 2019-07-04 00:25:30 +02:00
c22545aa40 fix xlm torchscript 2019-07-03 23:03:57 +02:00
3b23a846b6 Merge branch 'xlnet' of https://github.com/huggingface/pytorch-pretrained-BERT into xlnet 2019-07-03 22:54:58 +02:00
8fa3a1f0d8 updating tests 2019-07-03 22:54:53 +02:00
c41f2bad69 WIP XLM + refactoring 2019-07-03 22:54:39 +02:00
64ce4dbd86 Merge pull request #748 from huggingface/torchscript
Release 0.7 - Add Torchscript capabilities
2019-07-03 22:52:03 +02:00
b43b130f35 TorchScript flag in config; Tied weights when not running TorchScript; tuple concatenation clean-up. 2019-07-03 16:21:17 -04:00
4703148f0c TransformerXL can't be exported to TorchScript because of control-flow. Exception added to tests. 2019-07-03 14:50:23 -04:00
971c24687f XLNET can be exported to TorchScript 2019-07-03 11:03:09 -04:00
be54b16960 GPT can be exported to TorchScript 2019-07-02 18:09:45 -04:00
d8e83de792 GPT2 can be exported to TorchScript 2019-07-02 18:01:09 -04:00
288be7b7ea xlm 2019-07-02 23:42:31 +02:00
e891bb43d5 BERT can be exported to TorchScript 2019-07-02 17:23:18 -04:00
6ce1ee04fc TorchScript testing with output_attentions and output_hidden_state 2019-07-02 17:22:59 -04:00
7ed5bf706f add tests 2019-07-02 16:42:22 +02:00
708877958a updating tests and models, adding weights initialization test 2019-07-02 16:35:29 +02:00
99ae5ab883 update config tests and circle-ci 2019-07-02 12:40:39 +02:00
1484d67de9 [LARGE] updating all tests and API 2019-07-02 12:13:17 +02:00
64b2a828c0 fix evaluation bug 2019-07-01 14:56:24 -07:00
4f8b5f687c add fix for serialization of tokenizer 2019-06-29 23:35:21 +02:00
d9184620f9 fix tests and new API 2019-06-29 23:10:40 +02:00
dad3c7a485 Merge pull request #723 from tonianelope/master
Update Adam optimizer to follow pytorch convention for betas parameter (#510)
2019-06-28 17:28:25 +02:00
e296d5bef1 Merge pull request #704 from deepset-ai/master
Adjust s3 german Bert file storage
2019-06-28 17:10:58 +02:00
c68b4eceed Merge pull request #718 from Rocketknight1/master
Incorrect docstring for BertForMaskedLM
2019-06-28 17:08:51 +02:00
213981d8cb updating bert API 2019-06-28 16:45:24 +02:00
2b56e98892 standardizing API across models - XLNetForSeqClass working 2019-06-28 16:35:09 +02:00
3a00674cbf fix imports 2019-06-27 17:18:46 +02:00
d939d6fd02 fix hidden-state extraction 2019-06-27 09:39:44 +02:00
0c2ff34815 extracting double hidden-state from xlnet 2019-06-27 09:27:50 +02:00
08ff056c43 Added option to use multiple workers to create training data for lm fine tuning 2019-06-26 16:16:12 -07:00
3deea56c07 fixing loading function 2019-06-26 13:41:12 +02:00
f56b8033f0 more versatile loading 2019-06-26 13:13:15 +02:00
4d47f4985d slight refactoring, add abstract class for model loading 2019-06-26 12:52:44 +02:00
59cefd4f98 fix #726 - get_lr in examples 2019-06-26 11:28:27 +02:00
ddc2cc61a6 fix python2 tests 2019-06-26 11:17:42 +02:00
7e3070ae4f add from_pretrained method to all configuration classes 2019-06-26 11:12:00 +02:00
93e9971c54 fix tests 2019-06-26 10:02:45 +02:00
092dacfd62 changing is_regression to unified API 2019-06-26 09:54:05 +02:00
e55d4c4ede various updates to conversion, models and examples 2019-06-26 00:57:53 +02:00
603c513b35 update main conversion script and readme 2019-06-25 10:45:07 +02:00
7de1740490 add ability to restore fine-tuned TF model 2019-06-25 10:27:58 +02:00
c9885903a1 update betas to follow pytorch convention 2019-06-25 09:23:12 +01:00
7334bf6c21 pad on left for xlnet 2019-06-24 15:05:11 +02:00
c888663f18 overwrite output directories if needed 2019-06-24 14:38:24 +02:00
62d78aa37e updating GLUE utils for compatibility with XLNet 2019-06-24 14:36:11 +02:00
24ed0b9346 updating run_xlnet_classifier 2019-06-24 12:00:09 +02:00
f6081f2255 add xlnetforsequence classif and run_classifier example for xlnet 2019-06-24 10:01:07 +02:00
8d6a118aee Incorrect docstring for the head_mask argument to BertForMaskedLM 2019-06-23 18:47:05 +01:00
06716d7536 Merge pull request #3 from huggingface/master
Catch up with main repo
2019-06-23 18:46:03 +01:00
c946bb51a6 fix xlnet tokenizer and python2 2019-06-22 22:28:49 +02:00
98dc30b21e Merge pull request #714 from papower1/master
Correct a broken link on README
2019-06-22 21:29:41 +02:00
eae5d3819d Merge pull request #715 from Rocketknight1/master
Include a reference for LM finetuning
2019-06-22 21:29:19 +02:00
c7b2808ed7 Update LM finetuning README to include a literature reference 2019-06-22 15:04:01 +01:00
7c59e32d47 Merge pull request #2 from huggingface/master
Updating my fork to the latest version
2019-06-22 14:59:47 +01:00
ada0d8fec7 Merge pull request #1 from papower1/papower1-patch-1
Correct a broken link and its context.
2019-06-22 20:34:45 +09:00
fcc706343f Correct a broken link and its context.
Correct a broken link(run_lm_finetuning.py) and its context.
2019-06-22 20:33:48 +09:00
181075635d updating model loading and adding special tokens ids 2019-06-21 23:23:37 +02:00
ebd2cb8d74 update from_pretrained to load XLNetModel as well 2019-06-21 21:08:44 +02:00
483cbc36a9 test deviation with tf model: max ~1e-3 should be ok 2019-06-21 16:38:01 +02:00
24d8068982 weights loading script ok 2019-06-21 12:33:44 +02:00
32da75486b add tokenizer and tests 2019-06-21 11:09:51 +02:00
45709d7532 model running with simple inputs 2019-06-21 00:28:42 +02:00
b407972e27 update gitignore 2019-06-20 13:52:56 +02:00
c2ea5aef77 work in progress on xlnet 2019-06-20 13:52:21 +02:00
de713fa9b4 starting 2019-06-20 10:54:19 +02:00
c304593d8f BERTology details in readme 2019-06-20 10:05:06 +02:00
12e892e174 Merge pull request #697 from huggingface/updating_examples
Updating examples
2019-06-20 09:58:24 +02:00
411981a080 remove slow circle-ci 2019-06-20 08:54:18 +02:00
716cc1c4d9 added main() for programmatic call to convert pytorch->tf 2019-06-19 23:18:57 -04:00
a8e071c690 added notebook to check correctness of the pytorch->tensorflow conversion 2019-06-19 23:08:08 -04:00
0a4fb0da57 Merge remote-tracking branch 'upstream/master' into convert-back-to-tf
merging in latest changes from upstream
2019-06-19 22:56:20 -04:00
edfe91c36e first version bertology ok 2019-06-19 23:43:04 +02:00
7766ce66dd update bertology 2019-06-19 22:29:51 +02:00
7f00a36e27 pruning should keep on device 2019-06-19 22:23:12 +02:00
e4b46d86ce update head pruning 2019-06-19 22:16:30 +02:00
939cf29157 Adjust s3 german Bert file storage 2019-06-19 18:38:42 +02:00
0f40e8d6a6 debugger 2019-06-19 15:38:46 +02:00
0e1e8128bf more logging 2019-06-19 15:35:49 +02:00
909d4f1af2 cuda again 2019-06-19 15:32:10 +02:00
14f0e8e557 fix cuda 2019-06-19 15:29:28 +02:00
34d706a0e1 pruning in bertology 2019-06-19 15:25:49 +02:00
dc8e0019b7 updating examples 2019-06-19 13:23:20 +02:00
68ab9599ce small fix and updates to readme 2019-06-19 09:38:38 +02:00
f7e2ac01ea update barrier 2019-06-18 22:43:35 +02:00
4d8c4337ae test barrier in distrib training 2019-06-18 22:41:28 +02:00
3359955622 updating run_classif 2019-06-18 22:23:10 +02:00
29b7b30eaa updating evaluation on a single gpu 2019-06-18 22:20:21 +02:00
7d2001aa44 overwrite_output_dir 2019-06-18 22:13:30 +02:00
16a1f338c4 fixing 2019-06-18 17:06:31 +02:00
92e0ad5aba no numpy 2019-06-18 17:00:52 +02:00
4e6edc3274 hop 2019-06-18 16:57:15 +02:00
f55b60b9ee fixing again 2019-06-18 16:56:52 +02:00
8bd9118294 quick fix 2019-06-18 16:54:41 +02:00
3e847449ad fix out_label_ids 2019-06-18 16:53:31 +02:00
aad3a54e9c fix paths 2019-06-18 16:48:04 +02:00
40dbda6871 updating classification example 2019-06-18 16:45:52 +02:00
7388c83b60 update run_classifier for distributed eval 2019-06-18 16:32:49 +02:00
9727723243 fix pickle 2019-06-18 16:02:42 +02:00
9710b68dbc fix pickles 2019-06-18 16:01:15 +02:00
15ebd67d4e cache in run_classifier + various fixes to the examples 2019-06-18 15:58:22 +02:00
e6e5f19257 fix 2019-06-18 14:45:14 +02:00
a432b3d466 distributed traing t_total 2019-06-18 14:39:09 +02:00
c5407f343f split squad example in two 2019-06-18 14:29:03 +02:00
335f57baf8 only on main process 2019-06-18 14:03:46 +02:00
326944d627 add tensorboard to run_squad 2019-06-18 14:02:42 +02:00
d82e5deeb1 set find_unused_parameters=True in DDP 2019-06-18 12:13:14 +02:00
a59abedfb5 DDP update 2019-06-18 12:06:26 +02:00
2ef5e0de87 switch to pytorch DistributedDataParallel 2019-06-18 12:03:13 +02:00
9ce37af99b oops 2019-06-18 11:47:54 +02:00
a40955f071 no need to duplicate models anymore 2019-06-18 11:46:14 +02:00
3763f8944d Merge pull request #696 from huggingface/split_config_weights
Split config weights
2019-06-18 11:42:57 +02:00
f964753090 explanation on the current location of the caching folder 2019-06-18 11:36:28 +02:00
868de8d1d7 updating weights loading 2019-06-18 10:58:20 +02:00
64e0adda81 better error message 2019-06-18 10:51:31 +02:00
382e2d1e50 splitting config and weight files for bert also 2019-06-18 10:37:16 +02:00
a6f2511811 Merge pull request #694 from huggingface/release_0.6.3
Release 0.6.3
2019-06-17 16:27:25 +02:00
4447f270b2 updating hub 2019-06-17 16:21:28 +02:00
33d3db5c43 updating head masking, readme and docstrings 2019-06-17 15:51:28 +02:00
965f172de6 output all hidden layers states in GPT/GPT-2 2019-06-17 14:34:12 +02:00
f12007e421 add head masking and pruning to openai GPT 2019-06-17 14:19:40 +02:00
b860e47cf5 add head masking and pruning to gpt-2 2019-06-17 14:12:10 +02:00
7220d47a1c adding head pruning and tests 2019-06-17 13:20:45 +02:00
8415a38b23 better error messages 2019-06-17 13:03:48 +02:00
96c4d3d988 add head masking tests 2019-06-17 12:17:26 +02:00
34858ae1d9 adding bert whole words, bertgerman and gpt-2 medium models, head masking 2019-06-17 11:02:39 +02:00
80684f6f86 Merge pull request #690 from shashwath94/projadpsftmax_fix
Transformer XL ProjectedAdaptiveLogSoftmax output fix
2019-06-15 23:14:10 +02:00
9e363703d6 Merge pull request #688 from deepset-ai/german_bert
Add German Bert model to code, update readme
2019-06-15 23:13:41 +02:00
cc6cd430f7 Merge pull request #691 from vanche/master
import class "GPT2MultipleChoiceHead"
2019-06-15 23:12:55 +02:00
8289646d4e import class "GPT2MultipleChoiceHead" 2019-06-15 22:19:30 +09:00
5076a5daa7 Fix proj adp softmax output return when n_clusters=0 2019-06-14 22:03:21 -04:00
16af9ff7b0 Add German Bert model to code, update readme 2019-06-14 17:42:46 +02:00
b3f9e9451b Merge pull request #687 from huggingface/tests_and_doc
Updating tests and doc
2019-06-14 17:23:45 +02:00
44e9ddd7fe fix num_special_tokens in GPT 2 test 2019-06-14 17:17:43 +02:00
cad88e19de Merge pull request #672 from oliverguhr/master
Add vocabulary and model config to the finetune output
2019-06-14 17:02:47 +02:00
c6de625229 Merge pull request #655 from huggingface/finish_torchhub_interfaces
Finish torchhub interfaces
2019-06-14 17:02:08 +02:00
ff276fc00c Merge branch 'master' into finish_torchhub_interfaces 2019-06-14 16:59:07 +02:00
a64736dc23 Merge pull request #646 from Colanim/patch-1
Fix link in README
2019-06-14 16:57:45 +02:00
460d9afd45 Merge pull request #640 from Barqawiz/master
Support latest multi language bert fine tune
2019-06-14 16:57:02 +02:00
277c77f1c5 Merge pull request #630 from tguens/master
Update run_squad.py
2019-06-14 16:56:26 +02:00
659af2cbd0 Merge pull request #604 from samuelbroscheit/master
Fixing issue "Training beyond specified 't_total' steps with schedule 'warmup_linear'" reported in #556
2019-06-14 16:49:24 +02:00
2d6a53490d Merge pull request #597 from huggingface/attention
GPT-2 (medium size model, special_tokens, fine-tuning, attention) + repo code coverage metric
2019-06-14 16:47:32 +02:00
35e6baab37 Merge branch 'master' into attention 2019-06-14 16:41:56 +02:00
5e1207b8ad add attention to all bert models and add test 2019-06-14 16:28:25 +02:00
bcc9e93e6f fix test 2019-06-14 15:38:20 +02:00
f9cde97b31 Merge pull request #675 from meetshah1995/patch-1
[hotfix] Fix frozen pooler parameters in SWAG example.
2019-06-12 10:01:21 +02:00
e02ce4dc79 [hotfix] Fix frozen pooler parameters in SWAG example. 2019-06-11 15:13:53 -07:00
5c08c8c273 adds the tokenizer + model config to the output 2019-06-11 13:46:33 +02:00
784c0ed89a Merge pull request #668 from jeonsworld/patch-2
apply Whole Word Masking technique
2019-06-11 11:29:10 +02:00
a3a604cefb Update pregenerate_training_data.py
apply Whole Word Masking technique.
referred to [create_pretraining_data.py](https://github.com/google-research/bert/blob/master/create_pretraining_data.py)
2019-06-10 12:17:23 +09:00
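A minimal sketch of the grouping idea behind Whole Word Masking: WordPiece continuations (tokens starting with '##') are masked together with the word they belong to. This is an illustration only, not the script's exact implementation:

```python
import random

tokens = ["the", "un", "##break", "##able", "vase"]

# Group each head token with its '##' continuation pieces.
word_spans = []
for i, token in enumerate(tokens):
    if token.startswith("##") and word_spans:
        word_spans[-1].append(i)
    else:
        word_spans.append([i])

span = random.choice(word_spans)   # e.g. [1, 2, 3] covers the whole word "unbreakable"
masked = ["[MASK]" if i in span else t for i, t in enumerate(tokens)]
print(masked)
```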
ee0308f79d fix typo 2019-06-06 17:30:49 +02:00
2d07f945ad fix error with torch.no_grad and loss computation 2019-06-06 17:10:24 +02:00
6b8d227092 some cleaning 2019-06-06 17:07:03 +02:00
122d5c52ac distinguish what is not trained 2019-06-06 17:02:51 +02:00
2647ac3294 forgot bertForPreTraining 2019-06-06 16:57:40 +02:00
cf44d98392 Add more examples to BERT models for torchhub 2019-06-06 16:36:02 +02:00
a3274ac40b adding attention outputs in bert 2019-06-03 16:11:45 -05:00
826496580b Revert "add output_attentions for BertModel"
This reverts commit de5e5682a12463465a9eda4d2b13efad9c50d0dd.
2019-06-03 17:10:25 -04:00
de5e5682a1 add output_attentions for BertModel 2019-06-03 17:05:24 -04:00
312fdd7752 fix doc error 2019-06-01 17:43:26 -04:00
cdf0f2fec3 fix typo/presentation 2019-06-01 17:42:00 -04:00
8f97f6c57f fix typo
cc @thomwolf
2019-06-01 17:29:07 -04:00
466a96543a fix bug/typos 2019-06-01 17:28:56 -04:00
c198ff5f1f fix typos/bugs 2019-06-01 16:28:42 -04:00
592d1e3aae fix typos 2019-06-01 16:19:32 -04:00
f836130bff update hubconf 2019-06-01 16:08:29 -04:00
c0c7ff5751 add transformer xl compatibility for torchhub 2019-06-01 16:08:24 -04:00
48a58646e8 small fix in doc 2019-06-01 16:06:50 -04:00
2576a5c6db update hubconf for gpt2 torchhub compatibility 2019-06-01 15:28:01 -04:00
a92b6dc3c1 add GPT2 torchhub compatibility 2019-06-01 15:27:43 -04:00
2a329c6186 Merge pull request #651 from huggingface/gpt_torchhub
Add GPT* compatibility to torchhub
2019-05-31 14:44:52 +02:00
45d21502f0 update doc 2019-05-31 01:04:16 -04:00
98f5c7864f decorrelate dependencies + fix bug 2019-05-31 01:00:29 -04:00
c8bd026ef6 move dependencies list to hubconf 2019-05-31 00:36:58 -04:00
19ef2b0a66 Fix typo in hubconf 2019-05-31 00:33:33 -04:00
d0f591051c gpt_hubconf 2019-05-31 00:28:10 -04:00
4a210c9fc6 Move bert_hubconf to hubconfs 2019-05-31 00:28:00 -04:00
0c5a4fe9c9 modify from_pretrained for OpenAIGPT 2019-05-31 00:27:18 -04:00
372a5c1cee Hubconf doc - Special case loading 2019-05-30 16:06:21 -04:00
96592b544b default in __init__s for classification BERT models (#650) 2019-05-30 15:53:13 -04:00
4cda86b08f Update hubconf for torchhub: paths+examples+doc 2019-05-30 18:38:00 +00:00
1eba8b9d96 Fix link in README 2019-05-30 14:01:46 +09:00
314bc6bb4e added transposes to attention.self.[query,key,value] 2019-05-27 09:47:59 -04:00
c4fe56dcc0 support latest multilingual bert fine-tuning
fix issue with bert-base-multilingual and add support for uncased multilingual
2019-05-27 11:27:41 +02:00
8de1faea6f update to hf->tf args 2019-05-22 20:38:16 -04:00
d0adab2c39 fn change; pytorch_model_dir required=False 2019-05-22 20:24:04 -04:00
a309459b92 fn change; pytorch_model_dir required=False 2019-05-22 20:17:27 -04:00
9e7bc51b95 Update run_squad.py
Indentation change so that the output "nbest_predictions.json" is not empty.
2019-05-22 17:27:59 +08:00
69749f3fc3 update to hf->tf args 2019-05-18 17:16:01 -04:00
f1433db4f1 update to hf->tf args 2019-05-18 17:09:08 -04:00
077a5b0dc4 Merge remote-tracking branch 'upstream/master' into convert-back-to-tf
merging
2019-05-18 16:06:08 -04:00
2bcda8d00c update 2019-05-18 15:55:11 -04:00
94247ad6cb Make num_train_optimization_steps int 2019-05-13 12:38:22 +02:00
49a77ac16f Clean up a little bit 2019-05-12 00:31:10 +02:00
3bf3f9596f Fixing the issues reported in https://github.com/huggingface/pytorch-pretrained-BERT/issues/556
The reason for the issue was that optimization steps were computed from the example size, which differs from the actual size of the dataloader when an example is chunked into multiple instances.

Solution in this pull request is to compute num_optimization_steps directly from len(data_loader).
2019-05-12 00:13:45 +02:00
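A sketch of the corrected computation under the usual example-script conventions; the concrete numbers stand in for `len(train_dataloader)`, the gradient-accumulation setting, and the epoch count:

```python
# Before: steps were derived from the number of examples, which over-counts when
# one example is chunked into several training instances.
# After: derive them from the dataloader length itself.
len_train_dataloader = 1000          # stand-in for len(train_dataloader)
gradient_accumulation_steps = 2
num_train_epochs = 3

num_train_optimization_steps = (
    len_train_dataloader // gradient_accumulation_steps * num_train_epochs
)
print(num_train_optimization_steps)  # 1500
```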
3fc63f126d Merge pull request #598 from burcturkoglu/master
Updating learning rate with special warm up in examples
2019-05-10 13:48:12 +02:00
00c7fd2b79 Division to num_train_optimizer of global_step in lr_this_step is removed. 2019-05-09 10:57:03 +03:00
fa37b4da77 Merge branch 'master' of https://github.com/huggingface/pytorch-pretrained-BERT 2019-05-09 10:55:24 +03:00
5289b4b9e0 Division to num_train_optimizer of global_step in lr_this_step is removed. 2019-05-09 10:51:38 +03:00
275179a003 output attentions in GPT-2 2019-05-08 22:24:42 +02:00
366a3b0285 clean up in tokenization 2019-05-08 21:43:51 +02:00
701bd59b8b Merge pull request #585 from huntzhan/master
Make the epsilon of LayerNorm configurable.
2019-05-08 16:56:38 +02:00
303b5e2b92 Merge pull request #545 from ailzhang/cache_dir
move pytroch_pretrained_bert cache folder under same path as torch
2019-05-08 16:55:27 +02:00
0198399d84 Merge pull request #570 from MottoX/fix-1
Create optimizer only when args.do_train is True
2019-05-08 16:07:50 +02:00
50fa92c026 Merge pull request #571 from MottoX/patch-1
Fix documentation typo
2019-05-08 16:06:13 +02:00
0efc4ab632 adding dropout to GPT-2 and embedding dropout to GPT 2019-05-08 10:41:35 +02:00
ea9dbea9d5 update GPT2 loss computation for more flexibility 2019-05-07 23:27:18 +02:00
ce86336545 add predict_special_tokens option to GPT also 2019-05-07 16:47:22 +02:00
d1b6979aa5 GPT-2 option to avoid predicting special tokens 2019-05-07 16:25:53 +02:00
101ab4dd8e Make the epsilon of LayerNorm configurable. 2019-05-06 00:26:21 +08:00
41089bc7d3 added file to convert pytorch->tf 2019-05-02 13:26:22 -04:00
0a8b4d65be added file to convert pytorch->tf 2019-05-02 13:20:59 -04:00
968c1b44cb added file to convert pytorch->tf 2019-05-02 13:19:56 -04:00
96c2b77f0f added file to convert pytorch->tf 2019-05-02 13:14:25 -04:00
e211785ada extract attention weights from GPT 2019-05-02 18:31:26 +02:00
18c8aef9d3 Fix documentation typo 2019-05-02 19:23:36 +08:00
74dbba64bc Prepare optimizer only when args.do_train is True 2019-05-02 19:09:29 +08:00
db98a4a48b gpt-2 tokenizer 2019-05-01 11:40:48 +02:00
3ae8c8be1e Merge pull request #562 from apappu97/roc_stories_lmlabels_fix
Small fix to remove shifting of lm labels during pre process of RocStories.
2019-05-01 11:20:17 +02:00
e89520175d Merge pull request #564 from 8enmann/patch-2
Fix #537
2019-05-01 11:18:46 +02:00
74f7906db4 Fix #537 2019-04-30 19:48:22 -07:00
365fb34c6c small fix to remove shifting of lm labels during preprocessing of roc stories, as this shifting happens internally in the model 2019-04-30 13:53:04 -07:00
cd110835a0 coverage in circle-ci 2019-04-30 11:35:40 +02:00
2dee86319d Merge pull request #527 from Mathieu-Prouveur/fix_value_training_loss
Update example files so that tr_loss is not affected by args.gradient…
2019-04-30 11:12:55 +02:00
80f53f7380 gpt-2 from_pretrained can use special tokens 2019-04-30 11:10:22 +02:00
e79ceb1533 gpt-2 special tokens 2019-04-30 11:05:54 +02:00
1f5fc95b68 add code coverage 2019-04-30 11:05:26 +02:00
c30139a013 add special tokens to gpt-2 2019-04-30 10:45:26 +02:00
87b9ec3843 Fix tr_loss rescaling factor using global_step 2019-04-29 12:58:29 +02:00
3963d57c89 move pytroch_pretrained_bert cache folder under same path as torch 2019-04-27 11:09:11 -07:00
b832d5bb8a Release: 0.6.2 2019-04-25 21:37:47 +02:00
e6cf62d499 Merge pull request #488 from dhpollack/fix_multichoice
fixed BertForMultipleChoice model init and forward pass
2019-04-25 21:04:16 +02:00
1cc1c3c344 Merge pull request #533 from lukovnikov/master
Docs for new learning rate code
2019-04-25 21:02:35 +02:00
dee8af4e46 Merge pull request #518 from huggingface/schedules_in_examples
Fix training schedules in examples to match new API
2019-04-25 21:01:04 +02:00
56a47ce2b7 - replaced OpenAIGPTAdam with OpenAIAdam in docs 2019-04-25 16:05:28 +02:00
331a46ff04 - replaced OpenAIGPTAdam with OpenAIAdam in docs 2019-04-25 16:04:37 +02:00
704037ad51 - updated docs for new LR API
- added some images for illustration
- updated comments in optimization
2019-04-25 15:59:39 +02:00
d76a57b0ba Merge pull request #506 from ailzhang/hubconf
Hubconf
2019-04-24 20:59:21 +02:00
80f995a141 revert BertForMultipleChoice linear classifier 2019-04-24 16:51:54 +02:00
ed8fad7390 Update example files so that tr_loss is not affected by args.gradient_accumulation_step 2019-04-24 14:07:00 +02:00
d94c6b0144 fix training schedules in examples to match new API 2019-04-23 11:17:06 +02:00
c36cca075a Merge pull request #515 from Rocketknight1/master
Fix --reduce_memory in finetune_on_pregenerated
2019-04-23 10:30:23 +02:00
99e02c3415 Merge pull request #512 from cynthia/master
Fix indentation weirdness in GPT-2 example.
2019-04-23 10:29:01 +02:00
98cb7b2c51 Merge pull request #445 from lukovnikov/master
Learning rate schedules improvement + extension
2019-04-23 10:27:38 +02:00
b8e2a9c584 Made --reduce_memory actually do something in finetune_on_pregenerated 2019-04-22 14:01:48 +01:00
af8a0384fc Merge pull request #1 from huggingface/master
Pulling commits from main repo
2019-04-22 13:56:47 +01:00
14b1f719f4 Fix indentation weirdness in GPT-2 example. 2019-04-22 02:20:22 +09:00
69850b4011 python 2 compat 2019-04-21 14:02:38 +02:00
bb7557d3ab - removed __all__ in optimization
- removed unused plotting code
- using ABC for LRSchedule
- added some schedule object init tests
2019-04-21 13:48:33 +02:00
34ccc8ebf4 Merge remote-tracking branch 'upstream/master' 2019-04-21 13:16:15 +02:00
bfd6f6b257 fix from_pretrained positional args 2019-04-17 16:31:40 -07:00
ae4c9fee73 add hubconf 2019-04-17 13:34:34 -07:00
68a889ee43 Merge pull request #500 from huggingface/network
Updating network handling
2019-04-17 15:22:14 +02:00
34ae5bf838 small clean up in tests 2019-04-17 14:52:12 +02:00
23d4554ec0 is python 2 happy now 2019-04-17 14:48:34 +02:00
265550ec34 relax network connection requirements 2019-04-17 14:22:35 +02:00
fa76520240 fix file_utils on python 2 2019-04-17 13:32:22 +02:00
bcde2c61cb fix #497 2019-04-17 12:35:38 +02:00
929579f3b5 fix #497 2019-04-17 12:35:08 +02:00
31d387604c adding s3 model tests with --runslow 2019-04-17 11:58:27 +02:00
8407429d74 Merge pull request #494 from SudoSharma/patch-1
Fix indentation for unconditional generation
2019-04-17 11:11:36 +02:00
2e153930cf Merge pull request #495 from SudoSharma/patch-2
Fix gradient overflow issue during attention mask
2019-04-17 11:10:36 +02:00
46078e1b46 Merge pull request #496 from 8enmann/patch-1
[run_gpt2.py] temperature should be a float, not int
2019-04-17 11:08:54 +02:00
b8686130ca Merge pull request #498 from huggingface/GPT2_tokenization
Gpt2 tokenization
2019-04-17 11:06:41 +02:00
5afa497cbf fix GPT-2 tokenization to work also on python 3... 2019-04-17 11:04:41 +02:00
bc70779bf0 fixed GPT-2 tokenization on python 2 2019-04-17 10:56:15 +02:00
87677fcc4d [run_gpt2.py] temperature should be a float, not int 2019-04-16 15:23:21 -07:00
9e666aaa29 Fix gradient overflow issue during attention mask
This fix is in reference to issue #382. GPT2 can now be trained in mixed precision, which I've confirmed with testing. I also tested unconditional generation on multiple seeds before and after changing 1e10 to 1e4 and there was no difference. Please let me know if there is anything else I can do to make this pull request better. Thanks for all your work!
2019-04-16 11:42:34 -07:00
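A small demonstration of why the constant matters for mixed precision: float16 cannot represent -1e10 (its largest finite magnitude is about 65504), so the old value overflows, while -1e4 stays finite and still pushes the masked attention weights to effectively zero. Illustration only:

```python
import torch

print(torch.tensor(-1e10, dtype=torch.float16))  # tensor(-inf, dtype=torch.float16)
print(torch.tensor(-1e4, dtype=torch.float16))   # tensor(-10000., dtype=torch.float16)
# exp(-10000) underflows to 0, so a -1e4 offset still zeroes out masked positions
# in the softmax without producing inf/nan values in fp16.
```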
07154dadb4 Fix indentation for unconditional generation 2019-04-16 11:11:49 -07:00
bdaba1897c updating GPT tokenization 2019-04-16 17:44:06 +02:00
18a8a15f78 improving GPT2 tokenization and adding tests 2019-04-16 17:00:55 +02:00
3d78e226e6 Merge pull request #489 from huggingface/tokenization_serialization
Better serialization for Tokenizers and Configuration classes - Also fix #466
2019-04-16 08:49:54 +02:00
3571187ef6 fix saving models in distributed setting examples 2019-04-15 16:43:56 +02:00
64b6ef4db0 Merge pull request #490 from huggingface/better_finetuning_GPT_GPT-2
Clean up GPT and GPT-2 losses computation
2019-04-15 16:14:50 +02:00
d616022455 fix openai special tokens loading 2019-04-15 16:07:45 +02:00
df5d9c3551 load all models on cpu 2019-04-15 15:43:01 +02:00
2499b0a5fc add ptvsd to run_squad 2019-04-15 15:33:04 +02:00
7816f7921f clean up distributed training logging in run_squad example 2019-04-15 15:27:10 +02:00
1135f2384a clean up logger in examples for distributed case 2019-04-15 15:22:40 +02:00
cc43307023 update readme 2019-04-15 15:06:10 +02:00
60ea6c59d2 added best practices for serialization in README and examples 2019-04-15 15:00:33 +02:00
179a2c2ff6 update example to work with new serialization semantic 2019-04-15 14:33:23 +02:00
b3c6ee0ac1 tokenization updates 2019-04-15 14:24:52 +02:00
20577d8a7c add configuration serialization to readme 2019-04-15 14:21:41 +02:00
9761aa4845 add to_json_file method to configuration classes 2019-04-15 14:12:08 +02:00
b17963d82f update readme 2019-04-15 13:44:30 +02:00
e8568a3b17 fixing tests 2019-04-15 12:55:38 +02:00
870b734bfd added tokenizers serialization tests 2019-04-15 12:03:56 +02:00
3e65f255dc add serialization semantics to tokenizers - fix transfo-xl tokenizer 2019-04-15 11:47:25 +02:00
6b35cfd28f Merge pull request #423 from dhanajitb/master
making unconditional generation work
2019-04-15 11:01:53 +02:00
aff44f0c08 Merge branch 'master' into master 2019-04-15 10:58:34 +02:00
7e7e4753c8 Merge pull request #480 from mboyanov/docs/cls_token_info
Extend the BertForSequenceClassification docs to mention the special CLS token.
2019-04-15 10:57:25 +02:00
bb61b747df Merge pull request #474 from jiesutd/master
Fix tsv read error in Windows
2019-04-15 10:56:48 +02:00
7873d76464 Merge pull request #478 from Rocketknight1/master
Added a helpful error for users with single-document corpora - fixes #452
2019-04-15 10:55:57 +02:00
38ba7b439b fixed BertForMultipleChoice model init and forward pass 2019-04-15 10:38:01 +02:00
fe2756ff41 update double head model 2019-04-15 10:04:05 +02:00
34cf67fd6c Extend the BertForSequenceClassification docs to mention the special CLS token. 2019-04-12 21:30:28 +03:00
dbbd6c7500 Replaced some randints with cleaner randranges, and added a helpful
error for users whose corpus is just one giant document.
2019-04-12 15:07:58 +01:00
b509bf7655 updating loss computation 2019-04-12 12:12:33 +02:00
1d203a34c0 back to simple indexing 2019-04-11 23:51:03 +02:00
616743330e Merge pull request #462 from 8enmann/master
fix run_gpt2.py
2019-04-11 21:54:46 +02:00
2cdfb8b254 Merge pull request #467 from yaroslavvb/patch-2
Update README.md
2019-04-11 21:53:23 +02:00
c49ce3c722 fix tsv read error in Windows 2019-04-11 15:40:19 -04:00
074c869bbe fix OpenAIGPTMultipleChoiceHead 2019-04-11 20:53:50 +02:00
724eb45cef add stale bot 2019-04-11 17:12:00 +02:00
4bc4c69af9 finetuning any BERT model - fixes #455 2019-04-11 16:57:59 +02:00
a05fad8dce fix typo 2019-04-11 13:16:17 +02:00
4a82f4f856 update special token addition 2019-04-11 13:11:22 +02:00
991b8e65f4 Merge branch 'master' of https://github.com/huggingface/pytorch-pretrained-BERT 2019-04-11 11:43:15 +02:00
e99b2014cc fixes #471 2019-04-11 11:43:13 +02:00
8fffba5f47 Update README.md
Fix for

```
04/09/2019 21:39:38 - INFO - __main__ -   device: cuda n_gpu: 1, distributed training: False, 16-bits training: False
Traceback (most recent call last):
  File "/home/ubuntu/pytorch-pretrained-BERT/examples/lm_finetuning/simple_lm_finetuning.py", line 642, in <module>
    main()
  File "/home/ubuntu/pytorch-pretrained-BERT/examples/lm_finetuning/simple_lm_finetuning.py", line 502, in main
    raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")
ValueError: Training is currently the only implemented execution option. Please set `do_train`.
```
2019-04-09 14:45:47 -07:00
fd8a3556f0 fix run_gpt2.py 2019-04-08 17:20:35 -07:00
f4fc9c6152 Merge branch 'master' of https://github.com/dhanajitb/pytorch-pretrained-BERT 2019-04-07 17:52:35 +05:30
6c4c7be282 Merge remote-tracking branch 'upstream/master' 2019-04-07 16:59:36 +05:30
4d3cf0d602 removing some redundant lines 2019-04-07 16:59:07 +05:30
0d6a882f63 Cleaned some redundant lines
```while not args.unconditional:
   if not args.unconditional:
```
These lines have been updated
2019-04-07 16:54:38 +05:30
fc7693adc3 schedule fix 2019-04-03 18:16:47 +02:00
20686b78fc schedule fix 2019-04-03 18:13:52 +02:00
1b4ce76c38 schedule fix 2019-04-03 17:40:12 +02:00
5fed5bb3d6 schedule fix 2019-04-03 17:20:29 +02:00
23bd2eebf5 schedule fix 2019-04-03 17:10:34 +02:00
91a073f804 schedule fix 2019-04-03 17:10:08 +02:00
b64cc63a77 optimization schedule test update 2019-04-03 16:42:40 +02:00
d164867d90 - updated docs for optimization 2019-04-03 16:13:51 +02:00
1758c8fc72 - updated docs for optimization 2019-04-03 16:08:34 +02:00
725a56329d Merge remote-tracking branch 'upstream/master' into optim
# Conflicts:
#	pytorch_pretrained_bert/optimization.py

- updated docs for optimization
2019-04-03 16:07:50 +02:00
94980b529f Merge pull request #404 from CatalinVoss/fix_lm_loss
Fix Language Modeling Loss
2019-04-03 11:35:30 +02:00
9ca25ce828 Merge pull request #427 from jeonsworld/patch-1
fix sample_doc
2019-04-03 11:26:58 +02:00
db4dccd1b5 Merge pull request #389 from lukovnikov/master
Fix cosine schedule
2019-04-03 11:21:43 +02:00
19666dcb3b Should fix #438 2019-04-03 11:01:01 +02:00
1d8c232324 Fix #436 2019-04-03 10:51:03 +02:00
846b1fd6f8 Fix #419 2019-04-03 10:50:38 +02:00
404adcdabf Merge pull request #437 from MottoX/fix-link
Fix links in README
2019-04-02 11:40:46 +02:00
f26ce6992e Fix links in README 2019-04-02 17:20:32 +08:00
2f80dbbc0d Merge pull request #430 from MottoX/master
Fix typo in example code
2019-04-02 10:41:56 +02:00
94adad6be3 Merge pull request #435 from marpaia/training-fixes
Fixes to the TensorFlow conversion tool
2019-04-02 10:41:40 +02:00
8b5c63e4de Fixes to the TensorFlow conversion tool 2019-04-01 13:17:54 -06:00
d07db28f52 Fix typo in example code
Modify 'unambigiously' to 'unambiguously'
2019-03-31 01:20:18 +08:00
60005f464d Update pregenerate_training_data.py
If the value of rand_end is returned from the randint function, the value of sampled_doc_index that matches current_idx is returned from searchsorted.

example:
cumsum_max = {int64} 30
doc_cumsum = {ndarray} [ 5  7 11 19 30]
doc_lengths = {list} <class 'list'>: [5, 2, 4, 8, 11]
if current_idx  = 1,
rand_start = 7
rand_end = 35
sentence_index = randint(7, 35) % cumsum_max
if randint return 35, sentence_index becomes 5.
if sentence_index is 5, np.searchsorted returns 1 equal to current_index.
2019-03-30 14:50:17 +09:00
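A small sketch of the arithmetic described above, using randrange (exclusive upper bound) as one possible fix for the inclusive-randint wrap-around; the variable names follow the commit message, not necessarily the script itself:

```python
from random import randrange
import numpy as np

doc_lengths = [5, 2, 4, 8, 11]
doc_cumsum = np.cumsum(doc_lengths)      # [ 5  7 11 19 30]
cumsum_max = int(doc_cumsum[-1])         # 30
current_idx = 1

rand_start = int(doc_cumsum[current_idx])                        # 7
rand_end = rand_start + cumsum_max - doc_lengths[current_idx]    # 35

# randint(rand_start, rand_end) can return rand_end itself (35); 35 % 30 == 5 and
# np.searchsorted(doc_cumsum, 5, side="right") == 1 == current_idx, i.e. the
# "random other document" is the current one again. randrange excludes the bound.
sentence_index = randrange(rand_start, rand_end) % cumsum_max
sampled_doc_index = int(np.searchsorted(doc_cumsum, sentence_index, side="right"))
assert sampled_doc_index != current_idx
```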
4d3721f9bc Just updating
Merge remote-tracking branch 'upstream/master'
2019-03-29 21:56:47 +05:30
ec5c1d6134 Merge pull request #425 from Separius/patch-1
fix lm_finetuning's link
2019-03-29 09:14:11 +01:00
b588ff362a fix lm_finetuning's link 2019-03-29 12:39:24 +04:30
f872eb98c2 making unconditional generation work
The unconditional generation works now but if the seed is fixed, the sample is the same every time.
n_samples > 1 will give different samples though.
I am giving the start token as '<|endoftext|>' for the unconditional generation.
2019-03-28 22:46:15 +05:30
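For context, unconditional sampling of this kind can be reproduced roughly as follows; this is a sketch assuming the current `transformers` `generate()` helper, not the original `run_gpt2.py` script:

```python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

torch.manual_seed(42)  # with a fixed seed every run produces the same sample

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Unconditional generation: start from the '<|endoftext|>' token instead of a prompt.
start = tokenizer.encode("<|endoftext|>", return_tensors="pt")
output = model.generate(start, do_sample=True, max_length=50, top_k=40)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```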
694e2117f3 Merge pull request #388 from ananyahjha93/master
Added remaining GLUE tasks to 'run_classifier.py'
2019-03-28 09:06:53 +01:00
01520d5412 Remove my unhelpful comments :) 2019-03-27 10:45:28 -07:00
f7c9dc8c99 Merge pull request #409 from ikuyamada/master
Remove padding_idx from position_embeddings and token_type_embeddings
2019-03-27 12:30:03 +01:00
cc8c2d2332 Merge pull request #396 from IndexFziQ/IndexFziQ
add tqdm to the process of eval in examples/run_swag.py
2019-03-27 12:03:26 +01:00
bbff03fbfc Merge pull request #394 from desireevl/master
Minor change in README
2019-03-27 12:03:00 +01:00
2fb8ddeeff Merge pull request #392 from Rocketknight1/master
Add full language model fine-tuning
2019-03-27 12:02:36 +01:00
34561e61a5 update main readme also 2019-03-27 12:00:04 +01:00
361aff6de5 typos 2019-03-27 11:54:59 +01:00
cea8ba1d59 adjusted formating and some wording in the readme 2019-03-27 11:53:44 +01:00
0401317b23 Remove padding_idx from position_embeddings and token_type_embeddings 2019-03-26 21:56:35 +09:00
24e67fbf75 Minor README update 2019-03-25 12:33:30 +00:00
8d1d1ffde2 Corrected the displayed loss when gradient_accumulation_steps > 1 2019-03-25 12:15:19 +00:00
fda2f62395 Fix test failures due to old torch issue with non-contiguous view 2019-03-24 14:37:13 -07:00
0dd796e359 Also fix loss function issue with the double head models 2019-03-24 14:35:55 -07:00
472857c47f Fix typo syntax err (sorry, c/p from my repo) 2019-03-24 14:14:49 -07:00
2e6f5ffb96 Fix GPT language model loss here as well 2019-03-24 14:14:44 -07:00
5938f31fa7 Fix c/p typo from my experiment code 2019-03-24 14:14:40 -07:00
7797d21b8d Fix GPT2 language modeling loss computation 2019-03-24 14:14:35 -07:00
f471979167 added GLUE dev set results and details on how to run GLUE tasks 2019-03-21 15:38:30 -04:00
abb7d1ff6d Added proper context management to ensure cleanup happens in the right
order.
2019-03-21 17:50:03 +00:00
06a30cfdf3 Added a --reduce_memory option to the training script to keep training
data on disc as a memmap rather than in memory
2019-03-21 17:04:12 +00:00
7d1ae644ef Added a --reduce_memory option to the training script to keep training
data on disc as a memmap rather than in memory
2019-03-21 17:02:18 +00:00
2bba7f810e Added a --reduce_memory option to shelve docs to disc instead of keeping them in memory. 2019-03-21 16:50:16 +00:00
8733ffcb5e Removing a couple of other old unnecessary comments 2019-03-21 14:09:57 +00:00
8a861048dd Fixed up the notes on a possible future low-memory path 2019-03-21 14:08:39 +00:00
a8a577ba93 Reduced memory usage for pregenerating the data a lot by writing it
out on the fly without shuffling - the Sampler in the finetuning script
will shuffle for us.
2019-03-21 14:05:52 +00:00
0ae59e662d Reduced memory usage for pregenerating the data a lot by writing it
out on the fly without shuffling - the Sampler in the finetuning script
will shuffle for us.
2019-03-21 14:04:17 +00:00
6a9038ba53 Removed an old irrelevant comment 2019-03-21 13:36:41 +00:00
77944d1b31 add tqdm to the process of eval
Maybe better.
2019-03-21 20:59:33 +08:00
d52f914e24 weigths to weights 2019-03-21 15:02:59 +10:00
29a392fbcf Small README changes 2019-03-20 17:35:17 +00:00
832b2b0058 Adding README 2019-03-20 17:31:49 +00:00
934d3f4d2f Syncing up argument names between the scripts 2019-03-20 17:23:23 +00:00
f19ba35b2b Move old finetuning script into the new folder 2019-03-20 16:47:06 +00:00
7de5c6aa5e PEP8 and formatting cleanups 2019-03-20 16:44:04 +00:00
1798e98e5a Added final TODOs 2019-03-20 16:42:37 +00:00
c64c2fc4c2 Fixed embarrassing indentation problem 2019-03-20 15:42:57 +00:00
0540d360f2 Fixed logging 2019-03-20 15:36:51 +00:00
976554a472 First commit of the new LM finetuning 2019-03-20 14:23:51 +00:00
262a9992d7 class weights 2019-03-18 18:29:12 +01:00
19cc2c084e same 2019-03-18 15:13:35 +01:00
2283dcca5e import revert 2019-03-18 13:40:12 +01:00
b6c1cae67b branches, optim cosine fix 2019-03-18 13:32:04 +01:00
ef28b2c747 branches, optim cosine fix 2019-03-18 13:18:07 +01:00
90430ae7ec Merge remote-tracking branch 'origin/master'
# Conflicts:
#	pytorch_pretrained_bert/optimization.py
2019-03-18 13:15:29 +01:00
bed6408dcc branches, optim cosine fix 2019-03-18 13:09:55 +01:00
e5b63fb542 Merge branch 'master' of https://github.com/ananyahjha93/pytorch-pretrained-BERT
pull current master to local
2019-03-17 08:30:13 -04:00
8a4e90ff40 corrected folder creation error for MNLI-MM, verified GLUE results 2019-03-17 08:16:50 -04:00
e0bf01d9a9 added hack for mismatched MNLI 2019-03-16 14:10:48 -04:00
4c721c6b6a added eval time metrics for GLUE tasks 2019-03-15 23:21:24 -04:00
f3e5404880 Merge pull request #381 from tseretelitornike/master
Added missing imports.
2019-03-15 12:54:40 +01:00
83857ffeaa Added missing imports. 2019-03-15 12:45:48 +01:00
d5c037c3ed Merge pull request #380 from yongbowin/patch-3
typo in annotation
2019-03-14 15:56:40 +01:00
d1e4fa98a9 typo in annotation
modify `heruistic` to `heuristic` in line 660, `charcter` to `character` in line 661.
2019-03-14 17:32:15 +08:00
59e2bdd086 Merge pull request #379 from yongbowin/patch-2
typo
2019-03-14 10:17:18 +01:00
3d6452163d typo
modify `mull` to `null` in line 474 annotation.
2019-03-14 17:03:38 +08:00
76906372b0 Merge pull request #378 from huggingface/absolute_imports
Add absolute imports to GPT, GPT-2, Transfo-XL and and fix empty nbest_predictions.json
2019-03-14 10:00:47 +01:00
a98dfe4ced fixing #377 (empty nbest_predictions.json) 2019-03-14 09:57:06 +01:00
e5f2d9122c adding absolute imports to gpt2, openai and transfo-xl 2019-03-14 09:55:01 +01:00
043c8781ef added code for all glue task processors 2019-03-14 04:24:04 -04:00
eecaaa734a Merge pull request #371 from yongbowin/patch-1
Simplify code, delete redundancy line
2019-03-14 09:03:32 +01:00
20e652209c relation classification: replacing entity mention with mask token 2019-03-13 16:13:37 +01:00
22a465a91f Simplify code, delete redundancy line
delete redundancy line `if args.train`, simplify code.
2019-03-13 09:42:06 +08:00
eac039d21f changing docker 2019-03-12 13:45:12 +01:00
471daf1b6c changing docker 2019-03-12 13:32:42 +01:00
9024613337 changing docker 2019-03-12 13:23:58 +01:00
baf66d1419 restart cosine lr schedule 2019-03-12 13:22:23 +01:00
9b03d67b83 Merge pull request #362 from Bharat123rox/patch-1
Make the hyperlink of NVIDIA Apex clickable
2019-03-11 09:08:51 +01:00
8435d78f0c Merge pull request #361 from junjieqian/jqian/updateReadme
Correct line number in README for classes
2019-03-11 09:08:27 +01:00
80790705e0 Merge pull request #359 from elonmuskceo/fix-typo
Update run_gpt2.py
2019-03-11 09:07:56 +01:00
13aa13dbc0 Merge pull request #358 from cdjhz/patch-1
add 'padding_idx=0' for BertEmbeddings
2019-03-11 09:06:55 +01:00
c0660df5dd Merge pull request #357 from pglock/feature/354-use-dropout-layer-gpt
Use Dropout Layer in OpenAIGPTMultipleChoiceHead
2019-03-11 09:06:27 +01:00
f91ce0b803 Make the hyperlink of NVIDIA Apex clickable 2019-03-09 20:05:39 +05:30
51efde54a9 cos fix 2019-03-09 02:45:25 +01:00
f113a2dfdc readme de 2019-03-09 02:29:57 +01:00
90a41dbe14 BertAdam schedule objects 2019-03-09 02:23:20 +01:00
d648a02203 Correct line number in README for classes 2019-03-08 16:28:03 -08:00
88874f6cf0 BertAdam schedule objects 2019-03-08 19:08:30 +01:00
66d8206809 Update run_gpt2.py 2019-03-08 11:59:08 -05:00
72fa8d03a7 add 'padding_idx=0' for BertEmbeddings 2019-03-07 20:02:55 +08:00
6190e8ce4c Fix: use dropout layer 2019-03-07 10:12:45 +01:00
7cc35c3104 fix openai gpt example and updating readme 2019-03-06 11:43:21 +01:00
906b638efa updating readme 2019-03-06 10:24:19 +01:00
994d86609b fixing PYTORCH_PRETRAINED_BERT_CACHE use in examples 2019-03-06 10:21:24 +01:00
2dd8f524f5 removing test for long sequences error following #337 2019-03-06 10:10:41 +01:00
5c85fc3977 fix typo - logger info 2019-03-06 10:05:21 +01:00
8e36da7acb Merge pull request #347 from jplehmann/feature/sst2-processor
Processor for SST-2 task
2019-03-06 09:48:27 +01:00
21c88a07b7 Merge pull request #341 from potatochip/patch-1
catch exception if pathlib not install
2019-03-06 09:48:01 +01:00
3c01dfb775 Merge pull request #338 from CatalinVoss/patch-3
Fix top k generation for k != 0
2019-03-06 09:47:33 +01:00
477ec4b6cc Merge pull request #337 from CatalinVoss/patch-2
Allow tokenization of sequences > 512 for caching
2019-03-06 09:45:49 +01:00
7b9e5a54b5 Merge pull request #327 from lukovnikov/master
Issue#324: warmup linear fixes
2019-03-06 09:44:56 +01:00
4784b04f47 Merge pull request #325 from john-hewitt/master
add BertTokenizer flag to skip basic tokenization
2019-03-06 09:37:11 +01:00
4a49c22584 Warn instead of raising in BERT and GPT-2 tokenizers as well, to allow for pre-caching of tokens 2019-03-05 12:31:45 -08:00
e99bc87e4d Merge branch 'patch-1' into patch-2 2019-03-05 12:24:18 -08:00
0f96d4b1f7 Run classifier processor for SST-2. 2019-03-05 13:38:28 -06:00
0c970caa4a catch exception if pathlib not install 2019-03-04 14:30:19 -08:00
4b4b079272 Fix top k generation for k != 0 2019-03-02 21:54:44 -08:00
9775b2eb27 Allow tokenization of sequences > 512 for caching
For many applications requiring randomized data access, it's easier to cache the tokenized representations than the words. So why not turn this into a warning?
2019-03-02 16:30:21 -08:00
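A hedged sketch of the warn-instead-of-raise behaviour proposed here (illustrative; not the library's exact tokenizer code):

```python
import logging

logger = logging.getLogger(__name__)

def check_sequence_length(token_ids, max_len=512):
    """Warn (rather than raise) when a tokenized sequence exceeds the model's limit,
    so long documents can still be tokenized once and cached; the caller is then
    responsible for truncating before running the model."""
    if len(token_ids) > max_len:
        logger.warning(
            "Token indices sequence length (%d) is longer than the maximum "
            "sequence length for this model (%d); running it through the model "
            "will result in indexing errors.", len(token_ids), max_len,
        )
    return token_ids
```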
c0cf0a04d5 Fix typo 2019-02-27 18:01:06 -08:00
4d1ad83236 update docstring of BERT tokenizer to reflect do_wordpiece_only 2019-02-27 14:50:41 -08:00
35410da758 added warning 2019-02-27 17:11:42 +01:00
4d79e0d386 added warning 2019-02-27 16:50:05 +01:00
66a84b63b0 added warning 2019-02-27 16:38:00 +01:00
070f3b21d8 added warning 2019-02-27 16:26:45 +01:00
46ef646016 added warning 2019-02-27 16:22:27 +01:00
9bc3773c84 added warning 2019-02-27 16:10:31 +01:00
60a372387f added warning 2019-02-27 15:54:09 +01:00
e14c6b52e3 add BertTokenizer flag to skip basic tokenization 2019-02-26 20:11:24 -08:00
da2d8ca265 fix for negative learning rate with warmup_linear in BertAdam (happens when t_total is specified incorrectly)
+ copied BERT optimization warmup functions to OpenAI optimization file + added comments
2019-02-26 17:16:06 +01:00
e04bab59e1 fix for negative learning rate with warmup_linear in BertAdam (happens when t_total is specified incorrectly)
+ copied BERT optimization warmup functions to OpenAI optimization file + added comments
2019-02-26 16:22:52 +01:00
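The two commits above address the schedule going negative when t_total is mis-specified. A minimal sketch of the guard (illustrative only; the actual fix also added warnings and, later, schedule objects):

```python
import warnings

def warmup_linear(x, warmup=0.002):
    """x = global_step / t_total: linear warmup up to `warmup`, then linear decay."""
    if x < warmup:
        return x / warmup
    if x > 1.0:
        # t_total was set too small: 1 - x goes negative, which would make the
        # optimizer apply a negative learning rate. Warn and clamp at zero instead.
        warnings.warn("Training beyond the specified t_total; learning rate multiplier clamped to 0.")
        return 0.0
    return 1.0 - x
```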
2152bfeae8 Merge pull request #316 from joelgrus/gpt2docs
update documentation for gpt-2
2019-02-24 09:38:29 +01:00
8722e9eb3b finish updating docstrings 2019-02-23 06:31:59 -08:00
33aa7a80ca update documentation 2019-02-22 15:37:59 -08:00
a5b3a89545 Merge pull request #310 from spolu/spolu-nits_gpt2
Few small nits in GPT-2's README code examples
2019-02-21 10:23:27 +01:00
ff22b3acc0 Few small nits in GPT-2's code examples 2019-02-21 09:15:27 +00:00
cbb7fad319 Merge pull request #307 from guotong1988/patch-1
Update README.md
2019-02-21 09:25:19 +01:00
09efcece75 Update README.md 2019-02-21 11:25:33 +08:00
97c815dae2 Merge pull request #305 from bkj/patch-1
Update run_openai_gpt.py
2019-02-20 21:24:06 +01:00
8607233679 Update run_openai_gpt.py 2019-02-20 13:58:54 -05:00
f50b82af04 Merge pull request #302 from yongbowin/master
typo
2019-02-20 14:14:02 +01:00
2fdab323d1 typo 2019-02-20 21:11:06 +08:00
813e4d18ba typo 2019-02-20 21:10:07 +08:00
8337740754 Merge pull request #295 from tnlin/master
fix broken link in readme
2019-02-19 14:00:28 +01:00
5b0e0b61f0 fix typo in readme 2019-02-19 20:34:18 +08:00
3ca35b99ba Merge pull request #293 from davidefiocco/patch-2
Minor README typos corrected
2019-02-19 09:00:01 +01:00
0ae8eece55 MInor README typos corrected 2019-02-18 21:28:28 +01:00
07ebe0fd06 Merge pull request #292 from sam-qordoba/patch-3
Fix typo in `GPT2Model` code sample
2019-02-18 21:07:39 +01:00
1cb9c76ec5 Fix typo in GPT2Model code sample
Typo prevented code from running
2019-02-18 09:27:26 -08:00
a25d056b7a update readme 2019-02-18 15:30:11 +01:00
517d7c8624 update readme 2019-02-18 14:39:55 +01:00
ada22a1c9e more details in GPT-2 usage example 2019-02-18 14:37:41 +01:00
522733f6cb readme typo fixes 2019-02-18 14:32:10 +01:00
0202da0271 remove unnecessary example 2019-02-18 13:51:42 +01:00
8f46cd1057 Merge pull request #288 from huggingface/gpt2
forgot to add regex to requirements.txt :(
2019-02-18 12:00:11 +01:00
e0855e8929 forgot to add regex to requirements :( 2019-02-18 11:54:51 +01:00
0856a231c0 Merge pull request #287 from huggingface/gpt2
Gpt2
2019-02-18 11:38:05 +01:00
ab7f5d2943 simple 2019-02-18 11:33:54 +01:00
b450a7faf2 clean up tokenization - fix python 2 tests 2019-02-18 11:27:18 +01:00
d44db1145c update readme 2019-02-18 11:12:09 +01:00
690a0dbf36 fix example - masking 2019-02-18 10:50:30 +01:00
fbb248a2e4 examples testing 2019-02-18 01:28:18 +01:00
5ff0c60505 language update 2019-02-18 00:55:47 +01:00
210d407245 updating init 2019-02-18 00:55:39 +01:00
b65f07d8c0 adding examples 2019-02-18 00:55:33 +01:00
009ee86a19 fix tests - bump up version 2019-02-17 23:57:23 +01:00
ffd623823d adding gpt2 2019-02-17 23:38:51 +01:00
3a2f97db6f Merge pull request #286 from hendrycks/patch-1
Update activation function docstring
2019-02-17 15:30:46 +01:00
434d15da8e Update activation function docstring 2019-02-16 12:17:52 -08:00
5faf386652 Merge pull request #282 from wlhgtc/master
Fix some bug about SQuAD code
2019-02-15 10:06:51 +01:00
8efaf8f176 fix 'best_non_null_entry' is None error 2019-02-15 15:57:25 +08:00
0e774e57a6 Update readme
Adding details on how to extract a full list of hidden states for the Transformer-XL
2019-02-14 08:39:58 +01:00
c35d9d48d9 Merge pull request #275 from davidefiocco/patch-1
--do_lower_case is duplicated in parser args
2019-02-13 16:32:21 +01:00
65df0d78ed --do_lower_case is duplicated in parser args
Deleting one repetition (please review!)
2019-02-13 15:30:05 +01:00
4e56da38d9 Merge pull request #268 from wangxiaodiu/master
fixed a minor bug in README.md
2019-02-13 10:19:25 +01:00
cdcb206e10 Merge pull request #273 from huggingface/update_to_fifth_release
Update to fifth release
2019-02-13 10:19:08 +01:00
321d70a7a9 bump up to 0.5.1 2019-02-13 10:11:20 +01:00
67376c02e2 update readme for tokenizers 2019-02-13 10:11:11 +01:00
c6bea08448 OpenAI GPT Tokenizer can fallback on using BERT BasicTokenizer 2019-02-13 10:11:00 +01:00
e7cfc46fc1 fix TransfoXLModel loading 2019-02-13 09:32:46 +01:00
e1b3cfb504 fixed a minor bug in README.md 2019-02-12 15:54:23 +04:00
3c33499f87 fix typo in readme 2019-02-12 10:22:54 +01:00
335 changed files with 80684 additions and 10479 deletions

.circleci/config.yml
@@ -1,29 +1,143 @@
-version: 2
-jobs:
-    build_py3:
-        working_directory: ~/pytorch-pretrained-BERT
-        docker:
-            - image: circleci/python:3.5
-        steps:
-            - checkout
-            - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest ftfy spacy
-            - run: sudo python -m spacy download en
-            - run: python -m pytest -sv tests/
-    build_py2:
-        working_directory: ~/pytorch-pretrained-BERT
-        docker:
-            - image: circleci/python:2.7
-        steps:
-            - checkout
-            - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest spacy
-            - run: sudo pip install ftfy==4.4.3
-            - run: sudo python -m spacy download en
-            - run: python -m pytest -sv tests/
-workflows:
-    version: 2
-    build_and_test:
-        jobs:
-            - build_py3
-            - build_py2
+version: 2
+jobs:
+    run_tests_torch_and_tf:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install .[sklearn,tf,torch,testing]
+            - run: sudo pip install codecov pytest-cov
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
+            - run: codecov
+    run_all_tests_torch_and_tf:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_SLOW: yes
+            RUN_CUSTOM_TOKENIZERS: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install .[mecab,sklearn,tf,torch,testing]
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/
+    run_tests_torch:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install .[sklearn,torch,testing]
+            - run: sudo pip install codecov pytest-cov
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
+            - run: codecov
+    run_tests_tf:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install .[sklearn,tf,testing]
+            - run: sudo pip install codecov pytest-cov
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
+            - run: codecov
+    run_tests_custom_tokenizers:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        environment:
+            RUN_CUSTOM_TOKENIZERS: yes
+        steps:
+            - checkout
+            - run: sudo pip install .[mecab,testing]
+            - run: python -m pytest -sv ./tests/test_tokenization_bert_japanese.py
+    run_examples_torch:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install .[sklearn,torch,testing]
+            - run: sudo pip install -r examples/requirements.txt
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/
+    deploy_doc:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        steps:
+            - add_ssh_keys:
+                fingerprints:
+                    - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
+            - checkout
+            - run: sudo pip install .[tf,torch,docs]
+            - run: ./.circleci/deploy.sh
+    check_code_quality:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        resource_class: medium
+        parallelism: 1
+        steps:
+            - checkout
+            # we need a version of isort with https://github.com/timothycrosley/isort/pull/1000
+            - run: sudo pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
+            - run: sudo pip install .[tf,torch,quality]
+            - run: black --check --line-length 119 --target-version py35 examples templates tests src utils
+            - run: isort --check-only --recursive examples templates tests src utils
+            - run: flake8 examples templates tests src utils
+    check_repository_consistency:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        resource_class: small
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install requests
+            - run: python ./utils/link_tester.py
+workflow_filters: &workflow_filters
+    filters:
+        branches:
+            only:
+                - master
+workflows:
+    version: 2
+    build_and_test:
+        jobs:
+            - check_code_quality
+            - check_repository_consistency
+            - run_examples_torch
+            - run_tests_custom_tokenizers
+            - run_tests_torch_and_tf
+            - run_tests_torch
+            - run_tests_tf
+            - deploy_doc: *workflow_filters
+    run_slow_tests:
+        triggers:
+            - schedule:
+                cron: "0 4 * * 1"
+                filters:
+                    branches:
+                        only:
+                            - master
+        jobs:
+            - run_all_tests_torch_and_tf

.circleci/deploy.sh (new executable file)
@@ -0,0 +1,27 @@
cd docs
function deploy_doc(){
echo "Creating doc at commit $1 and pushing to folder $2"
git checkout $1
if [ ! -z "$2" ]
then
if [ -d "$dir/$2" ]; then
echo "Directory" $2 "already exists"
else
echo "Pushing version" $2
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
fi
else
echo "Pushing master"
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
fi
}
deploy_doc "master"
deploy_doc "b33a385" v1.0.0
deploy_doc "fe02e45" v1.1.0
deploy_doc "89fd345" v1.2.0
deploy_doc "fc9faa8" v2.0.0
deploy_doc "3ddce1d" v2.1.1
deploy_doc "3616209" v2.2.0
deploy_doc "d0f8b9a" v2.3.0

.coveragerc (new file)
@@ -0,0 +1,12 @@
[run]
source=transformers
omit =
# skip convertion scripts from testing for now
*/convert_*
*/__main__.py
[report]
exclude_lines =
pragma: no cover
raise
except
register_parameter

.github/ISSUE_TEMPLATE: new benchmark issue template (new file)
@@ -0,0 +1,22 @@
---
name: "\U0001F5A5 New benchmark"
about: Benchmark a part of this library and share your results
title: "[Benchmark]"
labels: ''
assignees: ''
---
# 🖥 Benchmarking `transformers`
## Benchmark
Which part of `transformers` did you benchmark?
## Set-up
What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use?
## Results
Put your results here!

.github/ISSUE_TEMPLATE: new model addition issue template (new file)
@@ -0,0 +1,20 @@
---
name: "\U0001F31F New model addition"
about: Submit a proposal/request to implement a new Transformer-based model
title: ''
labels: ''
assignees: ''
---
# 🌟 New model addition
## Model description
<!-- Important information -->
## Open source status
* [ ] the model implementation is available: (give details)
* [ ] the model weights are available: (give details)
* [ ] who are the authors: (mention them, if possible by @gh-username)

.github/ISSUE_TEMPLATE/bug-report.md (new file)
@@ -0,0 +1,50 @@
---
name: "\U0001F41B Bug Report"
about: Submit a bug report to help us improve transformers
title: ''
labels: ''
assignees: ''
---
# 🐛 Bug
## Information
Model I am using (Bert, XLNet ...):
Language I am using the model on (English, Chinese ...):
The problem arises when using:
* [ ] the official example scripts: (give details below)
* [ ] my own modified scripts: (give details below)
The tasks I am working on is:
* [ ] an official GLUE/SQUaD task: (give the name)
* [ ] my own task or dataset: (give details below)
## To reproduce
Steps to reproduce the behavior:
1.
2.
3.
<!-- If you have code snippets, error messages, stack traces please provide them here as well.
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.-->
## Expected behavior
<!-- A clear and concise description of what you would expect to happen. -->
## Environment
* OS:
* Python version:
* PyTorch version:
* `transformers` version (or branch):
* Using GPU ?
* Distributed or parallel setup ?
* Any other relevant information:

.github/ISSUE_TEMPLATE: feature request template (new file)
@@ -0,0 +1,25 @@
---
name: "\U0001F680 Feature request"
about: Submit a proposal/request for a new transformers feature
title: ''
labels: ''
assignees: ''
---
# 🚀 Feature request
<!-- A clear and concise description of the feature proposal.
Please provide a link to the paper and code in case they exist. -->
## Motivation
<!-- Please outline the motivation for the proposal. Is your feature request
related to a problem? e.g., I'm always frustrated when [...]. If this is related
to another GitHub issue, please link here too. -->
## Your contribution
<!-- Is there any way that you could help, e.g. by submitting a PR?
Make sure to read the CONTRIBUTING.MD readme:
https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md -->

.github/ISSUE_TEMPLATE/migration.md (new file)
@@ -0,0 +1,52 @@
---
name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers
title: ''
labels: ''
assignees: ''
---
# 📚 Migration
## Information
<!-- Important information -->
Model I am using (Bert, XLNet ...):
Language I am using the model on (English, Chinese ...):
The problem arises when using:
* [ ] the official example scripts: (give details below)
* [ ] my own modified scripts: (give details below)
The tasks I am working on is:
* [ ] an official GLUE/SQUaD task: (give the name)
* [ ] my own task or dataset: (give details below)
## Details
<!-- A clear and concise description of the migration issue.
If you have code snippets, please provide it here as well.
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
-->
## Environment
* OS:
* Python version:
* PyTorch version:
* `pytorch-transformers` or `pytorch-pretrained-bert` version (or branch):
* `transformers` version (or branch):
* Using GPU?
* Distributed or parallel setup?
* Any other relevant information:
## Checklist
- [ ] I have read the migration guide in the readme.
([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers);
[pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers))
- [ ] I checked if a related official extension example runs on my machine.

.github/ISSUE_TEMPLATE/question-help.md (new file)
@@ -0,0 +1,29 @@
---
name: "❓ Questions & Help"
about: Post your general questions on Stack Overflow tagged huggingface-transformers
title: ''
labels: ''
assignees: ''
---
# ❓ Questions & Help
<!-- The GitHub issue tracker is primarly intended for bugs, feature requests,
new models and benchmarks, and migration questions. For all other questions,
we direct you to Stack Overflow (SO) where a whole community of PyTorch and
Tensorflow enthusiast can help you out. Make sure to tag your question with the
right deep learning framework as well as the huggingface-transformers tag:
https://stackoverflow.com/questions/tagged/huggingface-transformers
If your question wasn't answered after a period of time on Stack Overflow, you
can always open a question on GitHub. You should then link to the SO question
that you posted.
-->
## Details
<!-- Description of your issue -->
<!-- You should first ask your question on SO, and only if
you didn't get an answer ask it here on GitHub. -->
**A link to original question on Stack Overflow**:

.github/stale.yml (new file)
@@ -0,0 +1,17 @@
# Number of days of inactivity before an issue becomes stale
daysUntilStale: 60
# Number of days of inactivity before a stale issue is closed
daysUntilClose: 7
# Issues with these labels will never be considered stale
exemptLabels:
- pinned
- security
# Label to use when marking an issue as stale
staleLabel: wontfix
# Comment to post when marking an issue as stale. Set to `false` to disable
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs. Thank you
for your contributions.
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: false

.gitignore
@@ -118,8 +118,24 @@ dmypy.json
 # vscode
 .vscode
+# Pycharm
+.idea
 # TF code
 tensorflow_code
 # Models
 models
+proc_data
+# examples
+runs
+examples/runs
+# data
+/data
+serialization_dir
+# emacs
+*.*~
+debug.env

CONTRIBUTING.md (new file)
@@ -0,0 +1,262 @@
# How to contribute to transformers?
Everyone is welcome to contribute, and we value everybody's contribution. Code
is thus not the only way to help the community. Answering questions, helping
others, reaching out and improving the documentations are immensely valuable to
the community.
It also helps us if you spread the word: reference the library from blog posts
on the awesome projects it made possible, shout out on Twitter every time it has
helped you, or simply star the repo to say "thank you".
## You can contribute in so many ways!
There are 4 ways you can contribute to transformers:
* Fixing outstanding issues with the existing code;
* Implementing new models;
* Contributing to the examples or to the documentation;
* Submitting issues related to bugs or desired new features.
*All are equally valuable to the community.*
## Submitting a new issue or feature request
Do your best to follow these guidelines when submitting an issue or a feature
request. It will make it easier for us to come back to you quickly and with good
feedback.
### Did you find a bug?
The transformers are robust and reliable thanks to the users who notify us of
the problems they encounter. So thank you for reporting an issue.
First, we would really appreciate it if you could **make sure the bug was not
already reported** (use the search bar on Github under Issues).
Did not find it? :( So we can act quickly on it, please follow these steps:
* Include your **OS type and version**, the versions of **Python**, **PyTorch** and
**Tensorflow** when applicable;
* A short, self-contained, code snippet that allows us to reproduce the bug in
less than 30s;
* Provide the *full* traceback if an exception is raised.
To get the OS and software versions, execute the following code and copy-paste
the output:
```
import platform; print("Platform", platform.platform())
import sys; print("Python", sys.version)
import torch; print("PyTorch", torch.__version__)
import tensorflow; print("Tensorflow", tensorflow.__version__)
```
### Do you want to implement a new model?
Awesome! Please provide the following information:
* Short description of the model and link to the paper;
* Link to the implementation if it is open-source;
* Link to the model weights if they are available.
If you are willing to contribute the model yourself, let us know so we can best
guide you.
We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder.
### Do you want a new feature (that is not a model)?
A world-class feature request addresses the following points:
1. Motivation first:
* Is it related to a problem/frustration with the library? If so, please explain
why. Providing a code snippet that demonstrates the problem is best.
* Is it related to something you would need for a project? We'd love to hear
about it!
* Is it something you worked on and think could benefit the community?
Awesome! Tell us what problem it solved for you.
2. Write a *full paragraph* describing the feature;
3. Provide a **code snippet** that demonstrates its future use;
4. In case this is related to a paper, please attach a link;
5. Attach any additional information (drawings, screenshots, etc.) you think may help.
If your issue is well written we're already 80% of the way there by the time you
post it.
We have added **templates** to guide you in the process of adding a new example script for training or testing the models in the library. You can find them in the [`templates`](./templates) folder.
## Start contributing! (Pull Requests)
Before writing code, we strongly advise you to search through the exising PRs or
issues to make sure that nobody is already working on the same thing. If you are
unsure, it is always a good idea to open an issue to get some feedback.
You will need basic `git` proficiency to be able to contribute to
`transformers`. `git` is not the easiest tool to use but it has the greatest
manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
Git](https://git-scm.com/book/en/v2) is a very good reference.
Follow these steps to start contributing:
1. Fork the [repository](https://github.com/huggingface/transformers) by
clicking on the 'Fork' button on the repository's page. This creates a copy of the code
under your GitHub user account.
2. Clone your fork to your local disk, and add the base repository as a remote:
```bash
$ git clone git@github.com:<your Github handle>/transformers.git
$ cd transformers
$ git remote add upstream https://github.com/huggingface/transformers.git
```
3. Create a new branch to hold your development changes:
```bash
$ git checkout -b a-descriptive-name-for-my-changes
```
**do not** work on the `master` branch.
4. Set up a development environment by running the following command in a virtual environment:
```bash
$ pip install -e ".[dev]"
```
(If transformers was already installed in the virtual environment, remove
it with `pip uninstall transformers` before reinstalling it in editable
mode with the `-e` flag.)
Right now, we need an unreleased version of `isort` to avoid a
[bug](https://github.com/timothycrosley/isort/pull/1000):
```bash
$ pip install -U git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
```
5. Develop the features on your branch.
As you work on the features, you should make sure that the test suite
passes:
```bash
$ make test
```
`transformers` relies on `black` and `isort` to format its source code
consistently. After you make changes, format them with:
```bash
$ make style
```
`transformers` also uses `flake8` to check for coding mistakes. Quality
control runs in CI, however you can also run the same checks with:
```bash
$ make quality
```
Once you're happy with your changes, add changed files using `git add` and
make a commit with `git commit` to record your changes locally:
```bash
$ git add modified_file.py
$ git commit
```
Please write [good commit
messages](https://chris.beams.io/posts/git-commit/).
It is a good idea to sync your copy of the code with the original
repository regularly. This way you can quickly account for changes:
```bash
$ git fetch upstream
$ git rebase upstream/master
```
Push the changes to your account using:
```bash
$ git push -u origin a-descriptive-name-for-my-changes
```
6. Once you are satisfied (**and the checklist below is happy too**), go to the
webpage of your fork on GitHub. Click on 'Pull request' to send your changes
to the project maintainers for review.
7. It's ok if maintainers ask you for changes. It happens to core contributors
too! So everyone can see the changes in the Pull request, work in your local
branch and push the changes to your fork. They will automatically appear in
the pull request.
### Checklist
1. The title of your pull request should be a summary of its contribution;
2. If your pull request adresses an issue, please mention the issue number in
the pull request description to make sure they are linked (and people
consulting the issue know you are working on it);
3. To indicate a work in progress please prefix the title with `[WIP]`. These
are useful to avoid duplicated work, and to differentiate it from PRs ready
to be merged;
4. Make sure pre-existing tests still pass;
5. Add high-coverage tests. No quality test, no merge;
6. All public methods must have informative docstrings;
### Tests
You can run 🤗 Transformers tests with `unittest` or `pytest`.
We like `pytest` and `pytest-xdist` because it's faster. From the root of the
repository, here's how to run tests with `pytest` for the library:
```bash
$ python -m pytest -n auto --dist=loadfile -s -v ./tests/
```
and for the examples:
```bash
$ pip install -r examples/requirements.txt # only needed the first time
$ python -m pytest -n auto --dist=loadfile -s -v ./examples/
```
In fact, that's how `make test` and `make test-examples` are implemented!
You can specify a smaller set of tests in order to test only the feature
you're working on.
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to
`yes` to run them. This will download many gigabytes of models — make sure you
have enough disk space and a good Internet connection, or a lot of patience!
```bash
$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/
$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/
```
Likewise, set the `RUN_CUSTOM_TOKENIZERS` environment variable to `yes` to run
tests for custom tokenizers, which don't run by default either.
🤗 Transformers uses `pytest` as a test runner only. It doesn't use any
`pytest`-specific features in the test suite itself.
This means `unittest` is fully supported. Here's how to run tests with
`unittest`:
```bash
$ python -m unittest discover -s tests -t . -v
$ python -m unittest discover -s examples -t examples -v
```
### Style guide
For documentation strings, `transformers` follows the [google
style](https://google.github.io/styleguide/pyguide.html).
#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md)

Makefile (new file)
@@ -0,0 +1,24 @@
.PHONY: quality style test test-examples
# Check that source code meets quality standards
quality:
black --check --line-length 119 --target-version py35 examples templates tests src utils
isort --check-only --recursive examples templates tests src utils
flake8 examples templates tests src utils
# Format source code automatically
style:
black --line-length 119 --target-version py35 examples templates tests src utils
isort --recursive examples templates tests src utils
# Run tests for the library
test:
python -m pytest -n auto --dist=loadfile -s -v ./tests/
# Run tests for examples
test-examples:
python -m pytest -n auto --dist=loadfile -s -v ./examples/

README.md (1,476 lines changed; diff too large to display)

Documentation deploy script (new file)
@@ -0,0 +1,23 @@
cd docs
function deploy_doc(){
echo "Creating doc at commit $1 and pushing to folder $2"
git checkout $1
if [ ! -z "$2" ]
then
echo "Pushing version" $2
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
else
echo "Pushing master"
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
fi
}
deploy_doc "master"
deploy_doc "b33a385" v1.0.0
deploy_doc "fe02e45" v1.1.0
deploy_doc "89fd345" v1.2.0
deploy_doc "fc9faa8" v2.0.0
deploy_doc "3ddce1d" v2.1.1
deploy_doc "f2f3294" v2.2.0
deploy_doc "d0f8b9a" v2.3.0

Dockerfile
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest
 RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
-RUN pip install pytorch-pretrained-bert
+RUN pip install transformers
 WORKDIR /workspace

docs/Makefile (new file)
@@ -0,0 +1,19 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = source
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

docs/README.md (new file)
@@ -0,0 +1,67 @@
# Generating the documentation
To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
you can install them with the following command, at the root of the code repository:
```bash
pip install -e ".[docs]"
```
## Packages installed
Here's an overview of all the packages installed. If you ran the previous command installing all packages from
`requirements.txt`, you do not need to run the following commands.
Building it requires the package `sphinx` that you can
install using:
```bash
pip install -U sphinx
```
You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by
[Read The Docs](https://readthedocs.org/). You can install it using the following command:
```bash
pip install sphinx_rtd_theme
```
The third necessary package is the `recommonmark` package to accept Markdown as well as Restructured text:
```bash
pip install recommonmark
```
## Building the documentation
Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following
command to generate it:
```bash
ln -s ../../examples/README.md examples.md
```
Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
```bash
make html
```
---
**NOTE**
If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build
directory before rebuilding. Run the following command to clean and build:
```bash
make clean && make html
```
---
It should build the static app that will be available under `/docs/_build/html`
## Adding a new element to the tree (toc-tree)
Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
in the source directory. You can then link it to the toc-tree by putting the filename without the extension.

(4 binary files not shown)

(new CSS file: syntax highlighting colors)
@@ -0,0 +1,12 @@
.highlight .c1, .highlight .sd{
color: #999
}
.highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc {
color: #FB8D68;
}
.highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
color: #6670FF;
}

(new CSS file: documentation theme styles)
@@ -0,0 +1,196 @@
/* The literal code blocks */
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
color: #6670FF;
}
/* To keep the logo centered */
.wy-side-scroll {
width: auto;
font-size: 20px;
}
/* The div that holds the Hugging Face logo */
.HuggingFaceDiv {
width: 100%
}
/* The research field on top of the toc tree */
.wy-side-nav-search{
background-color: #6670FF;
}
/* The toc tree */
.wy-nav-side{
background-color: #6670FF;
}
/* The selected items in the toc tree */
.wy-menu-vertical li.current{
background-color: #A6B0FF;
}
/* When a list item that does belong to the selected block from the toc tree is hovered */
.wy-menu-vertical li.current a:hover{
background-color: #B6C0FF;
}
/* When a list item that does NOT belong to the selected block from the toc tree is hovered. */
.wy-menu-vertical li a:hover{
background-color: #A7AFFB;
}
/* The text items on the toc tree */
.wy-menu-vertical a {
color: #FFFFDD;
font-family: Calibre-Light, sans-serif;
}
.wy-menu-vertical header, .wy-menu-vertical p.caption{
color: white;
font-family: Calibre-Light, sans-serif;
}
/* The color inside the selected toc tree block */
.wy-menu-vertical li.toctree-l2 a, .wy-menu-vertical li.toctree-l3 a, .wy-menu-vertical li.toctree-l4 a {
color: black;
}
/* Inside the depth-2 selected toc tree block */
.wy-menu-vertical li.toctree-l2.current>a {
background-color: #B6C0FF
}
.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a {
background-color: #C6D0FF
}
/* Inside the depth-3 selected toc tree block */
.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{
background-color: #D6E0FF
}
/* Inside code snippets */
.rst-content dl:not(.docutils) dt{
font-size: 15px;
}
/* Links */
a {
color: #6670FF;
}
/* Content bars */
.rst-content dl:not(.docutils) dt {
background-color: rgba(251, 141, 104, 0.1);
border-right: solid 2px #FB8D68;
border-left: solid 2px #FB8D68;
color: #FB8D68;
font-family: Calibre-Light, sans-serif;
border-top: none;
font-style: normal !important;
}
/* Expand button */
.wy-menu-vertical li.toctree-l2 span.toctree-expand,
.wy-menu-vertical li.on a span.toctree-expand, .wy-menu-vertical li.current>a span.toctree-expand,
.wy-menu-vertical li.toctree-l3 span.toctree-expand{
color: black;
}
/* Max window size */
.wy-nav-content{
max-width: 1200px;
}
/* Mobile header */
.wy-nav-top{
background-color: #6670FF;
}
/* Source spans */
.rst-content .viewcode-link, .rst-content .viewcode-back{
color: #6670FF;
font-size: 110%;
letter-spacing: 2px;
text-transform: uppercase;
}
/* It would be better for table to be visible without horizontal scrolling */
.wy-table-responsive table td, .wy-table-responsive table th{
white-space: normal;
}
.footer {
margin-top: 20px;
}
.footer__Social {
display: flex;
flex-direction: row;
}
.footer__CustomImage {
margin: 2px 5px 0 0;
}
/* class and method names in doc */
.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
font-family: Calibre, sans-serif;
font-size: 20px !important;
}
/* class name in doc*/
.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
margin-right: 10px;
font-family: Calibre-Medium, sans-serif;
}
/* Method and class parameters */
.sig-param{
line-height: 23px;
}
/* Class introduction "class" string at beginning */
.rst-content dl:not(.docutils) .property{
font-size: 18px;
color: black;
}
/* FONTS */
body{
font-family: Calibre, sans-serif;
font-size: 16px;
}
h1 {
font-family: Calibre-Thin, sans-serif;
font-size: 70px;
}
h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
font-family: Calibre-Medium, sans-serif;
}
@font-face {
font-family: Calibre-Medium;
src: url(./Calibre-Medium.otf);
font-weight:400;
}
@font-face {
font-family: Calibre;
src: url(./Calibre-Regular.otf);
font-weight:400;
}
@font-face {
font-family: Calibre-Light;
src: url(./Calibre-Light.ttf);
font-weight:400;
}
@font-face {
font-family: Calibre-Thin;
src: url(./Calibre-Thin.otf);
font-weight:400;
}

File diff suppressed because one or more lines are too long

(new SVG file: Hugging Face logo icon)
@@ -0,0 +1,47 @@
<svg width="95px" height="88px" viewBox="0 0 95 88" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<!-- Generator: Sketch 43.2 (39069) - http://www.bohemiancoding.com/sketch -->
<title>icon</title>
<desc>Created with Sketch.</desc>
<defs>
<path d="M13,14.7890193 C22.8284801,14.7890193 26,6.02605902 26,1.5261751 C26,-0.812484109 24.4279133,-0.0763570998 21.9099482,1.17020987 C19.5830216,2.32219957 16.4482998,3.91011313 13,3.91011313 C5.82029825,3.91011313 0,-2.97370882 0,1.5261751 C0,6.02605902 3.17151989,14.7890193 13,14.7890193 Z" id="path-1"></path>
</defs>
<g id="Page-1" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
<g id="icon_desktop">
<g id="icon">
<g id="icon_desktop">
<g id="Group-2">
<g id="Group">
<path d="M93.7930402,70.08 C94.5430402,72.24 94.3630402,74.54 93.3630402,76.54 C92.6430402,78 91.6130402,79.13 90.3530402,80.14 C88.8330402,81.34 86.9430402,82.36 84.6630402,83.34 C81.9430402,84.5 78.6230402,85.59 77.1030402,85.99 C73.2130402,87 69.4730402,87.64 65.6830402,87.67 C60.2630402,87.72 55.5930402,86.44 52.2730402,83.17 C50.5530402,83.38 48.8130402,83.5 47.0630402,83.5 C45.4030402,83.5 43.7630402,83.4 42.1330402,83.2 C38.8030402,86.45 34.1530402,87.72 28.7530402,87.67 C24.9630402,87.64 21.2230402,87 17.3230402,85.99 C15.8130402,85.59 12.4930402,84.5 9.77304019,83.34 C7.49304019,82.36 5.60304019,81.34 4.09304019,80.14 C2.82304019,79.13 1.79304019,78 1.07304019,76.54 C0.0830401858,74.54 -0.106959814,72.24 0.653040186,70.08 C-0.0469598142,68.43 -0.226959814,66.54 0.323040186,64.45 C0.573040186,63.5 0.983040186,62.62 1.50304019,61.84 C1.39304019,61.43 1.30304019,61.01 1.24304019,60.55 C0.863040186,57.81 1.81304019,55.31 3.60304019,53.37 C4.48304019,52.4 5.43304019,51.73 6.42304019,51.3 C5.69304019,48.2 5.31304019,45.01 5.31304019,41.75 C5.31304019,18.69 24.0030402,0 47.0630402,0 C54.9830402,0 62.3930402,2.2 68.7130402,6.04 C69.8530402,6.74 70.9730402,7.49 72.0430402,8.29 C72.5730402,8.69 73.1030402,9.1 73.6130402,9.53 C74.1330402,9.95 74.6430402,10.39 75.1330402,10.84 C76.6130402,12.19 78.0030402,13.64 79.2730402,15.19 C79.7030402,15.7 80.1130402,16.23 80.5130402,16.77 C81.3230402,17.84 82.0730402,18.95 82.7630402,20.1 C83.8130402,21.82 84.7330402,23.62 85.5330402,25.49 C86.0630402,26.74 86.5230402,28.02 86.9330402,29.33 C87.5430402,31.29 88.0130402,33.31 88.3330402,35.39 C88.4330402,36.08 88.5230402,36.78 88.5930402,37.48 C88.7330402,38.88 88.8130402,40.3 88.8130402,41.75 C88.8130402,44.97 88.4330402,48.13 87.7230402,51.18 C88.8230402,51.61 89.8630402,52.31 90.8330402,53.37 C92.6230402,55.31 93.5730402,57.82 93.1930402,60.56 C93.1330402,61.01 93.0430402,61.43 92.9330402,61.84 C93.4530402,62.62 93.8630402,63.5 94.1130402,64.45 C94.6630402,66.54 94.4830402,68.43 93.7930402,70.08" id="Fill-1" fill="#FFFFFF" fill-rule="nonzero"></path>
<circle id="Oval" fill="#FFD21E" fill-rule="nonzero" cx="46.75" cy="41.75" r="34.75"></circle>
<path d="M81.5,41.75 C81.5,22.5581049 65.9418951,7 46.75,7 C27.5581049,7 12,22.5581049 12,41.75 C12,60.9418951 27.5581049,76.5 46.75,76.5 C65.9418951,76.5 81.5,60.9418951 81.5,41.75 Z M8,41.75 C8,20.3489659 25.3489659,3 46.75,3 C68.1510341,3 85.5,20.3489659 85.5,41.75 C85.5,63.1510341 68.1510341,80.5 46.75,80.5 C25.3489659,80.5 8,63.1510341 8,41.75 Z" id="Oval" fill="#FFAC03" fill-rule="nonzero"></path>
<path d="M57.1723547,31.7151181 C58.0863134,32.7107502 57.3040427,35.2620959 58.7620957,35.2620959 C61.5235194,35.2620959 63.7620957,33.0235196 63.7620957,30.2620959 C63.7620957,27.5006721 61.5235194,25.2620959 58.7620957,25.2620959 C56.0006719,25.2620959 53.7620957,27.5006721 53.7620957,30.2620959 C53.7620957,31.5654666 56.3553563,30.8251108 57.1723547,31.7151181 Z" id="Oval-2" fill="#3A3B45" fill-rule="nonzero" transform="translate(58.762096, 30.262096) rotate(-28.000000) translate(-58.762096, -30.262096) "></path>
<path d="M32.1723553,31.7151181 C33.086314,32.7107502 32.3040433,35.2620959 33.7620963,35.2620959 C36.52352,35.2620959 38.7620963,33.0235196 38.7620963,30.2620959 C38.7620963,27.5006721 36.52352,25.2620959 33.7620963,25.2620959 C31.0006725,25.2620959 28.7620963,27.5006721 28.7620963,30.2620959 C28.7620963,31.5654666 31.3553569,30.8251108 32.1723553,31.7151181 Z" id="Oval-2" fill="#3A3B45" fill-rule="nonzero" transform="translate(33.762096, 30.262096) scale(-1, 1) rotate(-28.000000) translate(-33.762096, -30.262096) "></path>
<g id="Oval-4" transform="translate(33.500000, 41.500000)">
<g id="Mask" fill-rule="nonzero" fill="#3A3B45">
<path d="M13,14.7890193 C22.8284801,14.7890193 26,6.02605902 26,1.5261751 C26,-0.812484109 24.4279133,-0.0763570998 21.9099482,1.17020987 C19.5830216,2.32219957 16.4482998,3.91011313 13,3.91011313 C5.82029825,3.91011313 0,-2.97370882 0,1.5261751 C0,6.02605902 3.17151989,14.7890193 13,14.7890193 Z" id="path-1"></path>
</g>
<g id="Clipped">
<mask id="mask-2" fill="white">
<use xlink:href="#path-1"></use>
</mask>
<g id="path-1"></g>
<path d="M13.25,25 C18.0399291,25 21.9229338,21.1169953 21.9229338,16.3270662 C21.9229338,12.5962324 19.5672252,9.41560375 16.2620987,8.19147116 C16.1404592,8.14641904 16.0175337,8.10401696 15.8933923,8.06433503 C15.0599892,7.79793679 14.1717882,10.6623144 13.25,10.6623144 C12.3886883,10.6623144 11.5567012,7.77968641 10.7713426,8.01349068 C7.18916268,9.07991937 4.57706621,12.3984489 4.57706621,16.3270662 C4.57706621,21.1169953 8.46007093,25 13.25,25 Z" id="Shape" fill="#EF4E4E" fill-rule="nonzero" mask="url(#mask-2)"></path>
</g>
</g>
<circle id="Oval-3" fill="#FFD21E" fill-rule="nonzero" style="mix-blend-mode: multiply;" cx="70.25" cy="33.75" r="3.25"></circle>
<circle id="Oval-3" fill="#FFD21E" fill-rule="nonzero" style="mix-blend-mode: multiply;" cx="23.75" cy="33.75" r="3.25"></circle>
</g>
</g>
</g>
<g id="Group-4" transform="translate(3.000000, 48.000000)" fill-rule="nonzero">
<path d="M14.0619453,0 L14.0619453,0 C12.4429453,0 10.9959453,0.665 9.98694534,1.871 C9.36294534,2.618 8.71094534,3.822 8.65794534,5.625 C7.97894534,5.43 7.32594534,5.321 6.71594534,5.321 C5.16594534,5.321 3.76594534,5.915 2.77594534,6.994 C1.50394534,8.379 0.938945345,10.081 1.18494534,11.784 C1.30194534,12.595 1.57294534,13.322 1.97794534,13.995 C1.12394534,14.686 0.494945345,15.648 0.190945345,16.805 C-0.0470546551,17.712 -0.291054655,19.601 0.982945345,21.547 C0.901945345,21.674 0.825945345,21.806 0.754945345,21.941 C-0.0110546551,23.395 -0.0600546551,25.038 0.615945345,26.568 C1.64094534,28.887 4.18794534,30.714 9.13394534,32.675 C12.2109453,33.895 15.0259453,34.675 15.0509453,34.682 C19.1189453,35.737 22.7979453,36.273 25.9829453,36.273 C31.8369453,36.273 36.0279453,34.48 38.4399453,30.944 C42.3219453,25.25 41.7669453,20.042 36.7439453,15.022 C33.9639453,12.244 32.1159453,8.148 31.7309453,7.249 C30.9549453,4.587 28.9029453,1.628 25.4919453,1.628 L25.4909453,1.628 C25.2039453,1.628 24.9139453,1.651 24.6279453,1.696 C23.1339453,1.931 21.8279453,2.791 20.8949453,4.085 C19.8879453,2.833 18.9099453,1.837 18.0249453,1.275 C16.6909453,0.429 15.3579453,0 14.0619453,0 M14.0619453,4 C14.5719453,4 15.1949453,4.217 15.8819453,4.653 C18.0149453,6.006 22.1309453,13.081 23.6379453,15.833 C24.1429453,16.755 25.0059453,17.145 25.7829453,17.145 C27.3249453,17.145 28.5289453,15.612 25.9239453,13.664 C22.0069453,10.733 23.3809453,5.942 25.2509453,5.647 C25.3329453,5.634 25.4139453,5.628 25.4919453,5.628 C27.1919453,5.628 27.9419453,8.558 27.9419453,8.558 C27.9419453,8.558 30.1399453,14.078 33.9159453,17.851 C37.6919453,21.625 37.8869453,24.654 35.1349453,28.69 C33.2579453,31.442 29.6649453,32.273 25.9829453,32.273 C22.1639453,32.273 18.2489453,31.379 16.0549453,30.81 C15.9469453,30.782 2.60394534,27.013 4.29394534,23.805 C4.57794534,23.266 5.04594534,23.05 5.63494534,23.05 C8.01494534,23.05 12.3439453,26.592 14.2049453,26.592 C14.6209453,26.592 14.9139453,26.415 15.0339453,25.983 C15.8269453,23.138 2.97694534,21.942 4.05994534,17.821 C4.25094534,17.092 4.76894534,16.796 5.49694534,16.797 C8.64194534,16.797 15.6979453,22.328 17.1769453,22.328 C17.2899453,22.328 17.3709453,22.295 17.4149453,22.225 C18.1559453,21.029 17.7499453,20.194 12.5269453,17.033 C7.30394534,13.871 3.63794534,11.969 5.72294534,9.699 C5.96294534,9.437 6.30294534,9.321 6.71594534,9.321 C9.88694534,9.322 17.3789453,16.14 17.3789453,16.14 C17.3789453,16.14 19.4009453,18.243 20.6239453,18.243 C20.9049453,18.243 21.1439453,18.132 21.3059453,17.858 C22.1729453,16.396 13.2529453,9.636 12.7499453,6.847 C12.4089453,4.957 12.9889453,4 14.0619453,4" id="Fill-1" fill="#FFAC03"></path>
<path d="M35.1348,28.6899 C37.8868,24.6539 37.6918,21.6249 33.9158,17.8509 C30.1398,14.0779 27.9418,8.5579 27.9418,8.5579 C27.9418,8.5579 27.1208,5.3519 25.2508,5.6469 C23.3808,5.9419 22.0078,10.7329 25.9248,13.6639 C29.8418,16.5939 25.1448,18.5849 23.6378,15.8329 C22.1308,13.0809 18.0158,6.0059 15.8818,4.6529 C13.7488,3.2999 12.2468,4.0579 12.7498,6.8469 C13.2528,9.6359 22.1738,16.3959 21.3058,17.8589 C20.4378,19.3209 17.3788,16.1399 17.3788,16.1399 C17.3788,16.1399 7.8068,7.4289 5.7228,9.6989 C3.6388,11.9689 7.3038,13.8709 12.5268,17.0329 C17.7508,20.1939 18.1558,21.0289 17.4148,22.2249 C16.6728,23.4209 5.1428,13.6999 4.0598,17.8209 C2.9778,21.9419 15.8268,23.1379 15.0338,25.9829 C14.2408,28.8289 5.9828,20.5979 4.2938,23.8049 C2.6038,27.0129 15.9468,30.7819 16.0548,30.8099 C20.3648,31.9279 31.3108,34.2969 35.1348,28.6899" id="Fill-4" fill="#FFD21E"></path>
</g>
<g id="Group-4" transform="translate(70.500000, 66.500000) scale(-1, 1) translate(-70.500000, -66.500000) translate(50.000000, 48.000000)" fill-rule="nonzero">
<path d="M14.0619453,0 L14.0619453,0 C12.4429453,0 10.9959453,0.665 9.98694534,1.871 C9.36294534,2.618 8.71094534,3.822 8.65794534,5.625 C7.97894534,5.43 7.32594534,5.321 6.71594534,5.321 C5.16594534,5.321 3.76594534,5.915 2.77594534,6.994 C1.50394534,8.379 0.938945345,10.081 1.18494534,11.784 C1.30194534,12.595 1.57294534,13.322 1.97794534,13.995 C1.12394534,14.686 0.494945345,15.648 0.190945345,16.805 C-0.0470546551,17.712 -0.291054655,19.601 0.982945345,21.547 C0.901945345,21.674 0.825945345,21.806 0.754945345,21.941 C-0.0110546551,23.395 -0.0600546551,25.038 0.615945345,26.568 C1.64094534,28.887 4.18794534,30.714 9.13394534,32.675 C12.2109453,33.895 15.0259453,34.675 15.0509453,34.682 C19.1189453,35.737 22.7979453,36.273 25.9829453,36.273 C31.8369453,36.273 36.0279453,34.48 38.4399453,30.944 C42.3219453,25.25 41.7669453,20.042 36.7439453,15.022 C33.9639453,12.244 32.1159453,8.148 31.7309453,7.249 C30.9549453,4.587 28.9029453,1.628 25.4919453,1.628 L25.4909453,1.628 C25.2039453,1.628 24.9139453,1.651 24.6279453,1.696 C23.1339453,1.931 21.8279453,2.791 20.8949453,4.085 C19.8879453,2.833 18.9099453,1.837 18.0249453,1.275 C16.6909453,0.429 15.3579453,0 14.0619453,0 M14.0619453,4 C14.5719453,4 15.1949453,4.217 15.8819453,4.653 C18.0149453,6.006 22.1309453,13.081 23.6379453,15.833 C24.1429453,16.755 25.0059453,17.145 25.7829453,17.145 C27.3249453,17.145 28.5289453,15.612 25.9239453,13.664 C22.0069453,10.733 23.3809453,5.942 25.2509453,5.647 C25.3329453,5.634 25.4139453,5.628 25.4919453,5.628 C27.1919453,5.628 27.9419453,8.558 27.9419453,8.558 C27.9419453,8.558 30.1399453,14.078 33.9159453,17.851 C37.6919453,21.625 37.8869453,24.654 35.1349453,28.69 C33.2579453,31.442 29.6649453,32.273 25.9829453,32.273 C22.1639453,32.273 18.2489453,31.379 16.0549453,30.81 C15.9469453,30.782 2.60394534,27.013 4.29394534,23.805 C4.57794534,23.266 5.04594534,23.05 5.63494534,23.05 C8.01494534,23.05 12.3439453,26.592 14.2049453,26.592 C14.6209453,26.592 14.9139453,26.415 15.0339453,25.983 C15.8269453,23.138 2.97694534,21.942 4.05994534,17.821 C4.25094534,17.092 4.76894534,16.796 5.49694534,16.797 C8.64194534,16.797 15.6979453,22.328 17.1769453,22.328 C17.2899453,22.328 17.3709453,22.295 17.4149453,22.225 C18.1559453,21.029 17.7499453,20.194 12.5269453,17.033 C7.30394534,13.871 3.63794534,11.969 5.72294534,9.699 C5.96294534,9.437 6.30294534,9.321 6.71594534,9.321 C9.88694534,9.322 17.3789453,16.14 17.3789453,16.14 C17.3789453,16.14 19.4009453,18.243 20.6239453,18.243 C20.9049453,18.243 21.1439453,18.132 21.3059453,17.858 C22.1729453,16.396 13.2529453,9.636 12.7499453,6.847 C12.4089453,4.957 12.9889453,4 14.0619453,4" id="Fill-1" fill="#FFAC03"></path>
<path d="M35.1348,28.6899 C37.8868,24.6539 37.6918,21.6249 33.9158,17.8509 C30.1398,14.0779 27.9418,8.5579 27.9418,8.5579 C27.9418,8.5579 27.1208,5.3519 25.2508,5.6469 C23.3808,5.9419 22.0078,10.7329 25.9248,13.6639 C29.8418,16.5939 25.1448,18.5849 23.6378,15.8329 C22.1308,13.0809 18.0158,6.0059 15.8818,4.6529 C13.7488,3.2999 12.2468,4.0579 12.7498,6.8469 C13.2528,9.6359 22.1738,16.3959 21.3058,17.8589 C20.4378,19.3209 17.3788,16.1399 17.3788,16.1399 C17.3788,16.1399 7.8068,7.4289 5.7228,9.6989 C3.6388,11.9689 7.3038,13.8709 12.5268,17.0329 C17.7508,20.1939 18.1558,21.0289 17.4148,22.2249 C16.6728,23.4209 5.1428,13.6999 4.0598,17.8209 C2.9778,21.9419 15.8268,23.1379 15.0338,25.9829 C14.2408,28.8289 5.9828,20.5979 4.2938,23.8049 C2.6038,27.0129 15.9468,30.7819 16.0548,30.8099 C20.3648,31.9279 31.3108,34.2969 35.1348,28.6899" id="Fill-4" fill="#FFD21E"></path>
</g>
</g>
</g>
</g>
</svg>


54
docs/source/benchmarks.md Normal file
View File

@ -0,0 +1,54 @@
# Benchmarks
This section is dedicated to the benchmarks done with the library, by maintainers, contributors and users. These
benchmarks will help keep track of the performance improvements that are brought to our models across versions.
## Benchmarking all models for inference
As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with
and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
TensorFlow XLA) and GPUs.
The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2)
The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing).
## TF2 with mixed precision, XLA, Distribution (@tlkh)
This work was done by [Timothy Liu](https://github.com/tlkh).
There are very positive results to be gained from the various TensorFlow 2.0 features:
- Automatic Mixed Precision (AMP)
- XLA compiler
- Distribution strategies (multi-GPU)
The benefits are listed here (tested on CoLA, MRPC, SST-2):
- AMP: Between 1.4x and 1.6x decrease in overall time without change in batch size
- AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset)
- Distribution: Between 1.4x and 3.4x decrease in overall time on 4xV100
- Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput
The model quality (measured by the validation accuracy) fluctuates slightly. Taking an average of 4 training runs
on a single GPU gives the following results:
- CoLA: AMP results in slightly lower acc (0.820 vs 0.824)
- MRPC: AMP results in lower acc (0.823 vs 0.835)
- SST-2: AMP results in slightly lower acc (0.918 vs 0.922)
However, in a distributed setting with 4xV100 (4x batch size), AMP can yield better results:
- CoLA: AMP results in higher acc (0.828 vs 0.812)
- MRPC: AMP results in lower acc (0.817 vs 0.827)
- SST-2: AMP results in slightly lower acc (0.926 vs 0.929)
The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py).
Note: on some tasks (e.g. MRPC), the dataset is too small. The overhead due to the model compilation with XLA as well
as the distribution strategy setup does not speed things up. The XLA compile time is also the reason why, although throughput
can increase a lot (e.g. 2.7x for single GPU), the overall (end-to-end) training speed-up is not as high (as low as 1.4x).
The benefit, as seen on SST-2 (the larger dataset), is much clearer.
All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445).

18
docs/source/bertology.rst Normal file
View File

@ -0,0 +1,18 @@
BERTology
---------
There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
* Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
* accessing all the hidden-states of BERT/GPT/GPT-2,
* accessing all the attention weights for each head of BERT/GPT/GPT-2,
* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
To help you understand and use these features, we have added a specific example script, `run_bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_, which extracts information from, and prunes heads of, a model pre-trained on GLUE.
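A minimal sketch of these features (the checkpoint, sentence and pruned heads below are illustrative choices, not taken from the script above): hidden-states and attentions can be requested at load time and heads pruned afterwards:

::

    import torch
    from transformers import BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    # Ask the model to also return all hidden-states and attention weights
    model = BertModel.from_pretrained("bert-base-uncased",
                                      output_hidden_states=True,
                                      output_attentions=True)

    input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])
    last_hidden_state, pooler_output, hidden_states, attentions = model(input_ids)

    # Prune heads 0 and 1 of the first layer (an arbitrary choice for illustration)
    model.prune_heads({0: [0, 1]})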

188
docs/source/conf.py Normal file
View File

@ -0,0 +1,188 @@
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('../../src'))
# -- Project information -----------------------------------------------------
project = u'transformers'
copyright = u'2019, huggingface'
author = u'huggingface'
# The short X.Y version
version = u''
# The full version, including alpha/beta/rc tags
release = u'2.4.0'
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.coverage',
    'sphinx.ext.napoleon',
    'recommonmark',
    'sphinx.ext.viewcode',
    'sphinx_markdown_tables'
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
source_suffix = ['.rst', '.md']
# source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {
    'analytics_id': 'UA-83738774-2'
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'transformersdoc'
# -- Options for LaTeX output ------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'transformers.tex', u'transformers Documentation',
     u'huggingface', 'manual'),
]
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'transformers', u'transformers Documentation',
     [author], 1)
]
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'transformers', u'transformers Documentation',
     author, 'transformers', 'One line description of project.',
     'Miscellaneous'),
]
# -- Options for Epub output -------------------------------------------------
# Bibliographic Dublin Core info.
epub_title = project
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''
# A unique identification for the text.
#
# epub_uid = ''
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
def setup(app):
    app.add_stylesheet('css/huggingface.css')
    app.add_stylesheet('css/code-snippets.css')
    app.add_js_file('js/custom.js')
# -- Extension configuration -------------------------------------------------

View File

@ -0,0 +1,113 @@
Converting Tensorflow Checkpoints
================================================
A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints into models that can be loaded using the ``from_pretrained`` methods of the library.
.. note::

    Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**)
    available in any transformers >= 2.3.0 installation.

    The documentation below reflects the **transformers-cli convert** command format.
BERT
^^^^
You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint into the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\ ``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.
To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install tensorflow``\ ). The rest of the repository only requires PyTorch.
Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model:
.. code-block:: shell

    export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

    transformers-cli convert --model_type bert \
      --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
      --config $BERT_BASE_DIR/bert_config.json \
      --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.
OpenAI GPT
^^^^^^^^^^
Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint is saved in the same format as the OpenAI pretrained model (see `here <https://github.com/openai/finetune-transformer-lm>`__\ ):
.. code-block:: shell

    export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights

    transformers-cli convert --model_type gpt \
      --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
      [--config OPENAI_GPT_CONFIG] \
      [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK]
OpenAI GPT-2
^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here <https://github.com/openai/gpt-2>`__\ )
.. code-block:: shell

    export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights

    transformers-cli convert --model_type gpt2 \
      --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
      [--config OPENAI_GPT2_CONFIG] \
      [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
Transformer-XL
^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here <https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models>`__\ )
.. code-block:: shell

    export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint

    transformers-cli convert --model_type transfo_xl \
      --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
      [--config TRANSFO_XL_CONFIG] \
      [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
XLNet
^^^^^
Here is an example of the conversion process for a pre-trained XLNet model:
.. code-block:: shell

    export XLNET_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
    export XLNET_CONFIG_PATH=/path/to/xlnet/config

    transformers-cli convert --model_type xlnet \
      --tf_checkpoint $XLNET_CHECKPOINT_PATH \
      --config $XLNET_CONFIG_PATH \
      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
      [--finetuning_task_name XLNET_FINETUNED_TASK]
XLM
^^^
Here is an example of the conversion process for a pre-trained XLM model:
.. code-block:: shell

    export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint

    transformers-cli convert --model_type xlm \
      --tf_checkpoint $XLM_CHECKPOINT_PATH \
      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
      [--config XLM_CONFIG] \
      [--finetuning_task_name XLM_FINETUNED_TASK]

1
docs/source/examples.md Symbolic link
View File

@ -0,0 +1 @@
../../examples/README.md

145
docs/source/glossary.rst Normal file
View File

@ -0,0 +1,145 @@
Glossary
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
detailed here alongside usage examples.
Input IDs
--------------------------
The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
numerical representations of tokens building the sequences that will be used as input by the model*.
Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ tokenizer:
::

    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    sequence = "A Titan RTX has 24GB of VRAM"
The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
::

    # Continuation of the previous script
    tokenized_sequence = tokenizer.tokenize(sequence)
    assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
These tokens can then be converted into IDs which are understandable by the model. Several methods are available for
this, the recommended being `encode` or `encode_plus`, which leverage the Rust implementation of
`huggingface/tokenizers <https://github.com/huggingface/tokenizers>`__ for peak performance.
::

    # Continuation of the previous script
    encoded_sequence = tokenizer.encode(sequence)
    assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
The `encode` and `encode_plus` methods automatically add "special tokens" which are special IDs the model uses.
Attention mask
--------------------------
The attention mask is an optional argument used when batching sequences together. This argument indicates to the
model which tokens should be attended to, and which should not.
For example, consider these two sequences:
::

    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

    sequence_a = "This is a short sequence."
    sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

    encoded_sequence_a = tokenizer.encode(sequence_a)
    assert len(encoded_sequence_a) == 8

    encoded_sequence_b = tokenizer.encode(sequence_b)
    assert len(encoded_sequence_b) == 19
These two sequences have different lengths and therefore can't be put together in the same tensor as-is. The first
sequence needs to be padded up to the length of the second one, or the second one needs to be truncated down to
the length of the first one.
In the first case, the list of IDs will be extended by the padding indices:
::

    # Continuation of the previous script
    padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True)

    assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
the position of the padded indices so that the model does not attend to them. For the
:class:`~transformers.BertTokenizer`, :obj:`1` indicates a value that should be attended to while :obj:`0` indicates
a padded value.
The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to obtain the attention mask directly:
::

    # Continuation of the previous script
    sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)

    assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    assert sequence_a_dict['attention_mask'] == [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Token Type IDs
--------------------------
Some models' purpose is to do sequence classification or question answering. These require two different sequences to
be encoded in the same input IDs. They are usually separated by special tokens, such as the classifier and separator
tokens. For example, the BERT model builds its two-sequence input as follows:
::

    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

    # [CLS] SEQ_A [SEP] SEQ_B [SEP]
    sequence_a = "HuggingFace is based in NYC"
    sequence_b = "Where is HuggingFace based?"

    encoded_sequence = tokenizer.encode(sequence_a, sequence_b)
    assert tokenizer.decode(encoded_sequence) == "[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]"
This is enough for some models to understand where one sequence ends and where another begins. However, other models,
such as BERT, have an additional mechanism: the segment IDs. The token type IDs are a binary mask identifying
the different sequences in the model.
We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output the Token Type IDs for us:
::

    # Continuation of the previous script
    encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)

    assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
    assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel` use an
additional token represented by a :obj:`2`.
Position IDs
--------------------------
The position IDs are used by the model to identify which token is at which position. Contrary to RNNs that have the
position of each token embedded within them, transformers are unaware of the position of each token. The position
IDs are created for this purpose.
They are an optional parameter. If no position IDs are passed to the model, they are automatically created as absolute
positional embeddings.
Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models
use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
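As a minimal sketch (the checkpoint and sentence below are illustrative), explicit position IDs can be created and passed alongside the input IDs:

::

    import torch
    from transformers import BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    model = BertModel.from_pretrained("bert-base-cased")

    input_ids = torch.tensor([tokenizer.encode("A Titan RTX has 24GB of VRAM")])
    # Absolute positions 0 .. sequence length - 1, which is also what the model builds by default
    position_ids = torch.arange(input_ids.size(1)).unsqueeze(0)

    outputs = model(input_ids, position_ids=position_ids)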

Binary image files not shown (six images added, ranging from 8.7 KiB to 22 KiB).
102
docs/source/index.rst Normal file
View File

@ -0,0 +1,102 @@
Transformers
================================================================================================================================================
🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose architectures
(BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation
(NLG) with 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`__.
Features
---------------------------------------------------
- As easy to use as pytorch-transformers
- As powerful and concise as Keras
- High performance on NLU and NLG tasks
- Low barrier to entry for educators and practitioners
State-of-the-art NLP for everyone:
- Deep learning researchers
- Hands-on practitioners
- AI/ML/NLP teachers and educators
Lower compute costs, smaller carbon footprint:
- Researchers can share trained models instead of always retraining
- Practitioners can reduce compute time and production costs
- 8 architectures with over 30 pretrained models, some in more than 100 languages
Choose the right framework for every part of a model's lifetime:
- Train state-of-the-art models in 3 lines of code
- Deep interoperability between TensorFlow 2.0 and PyTorch models
- Move a single model between TF2.0/PyTorch frameworks at will
- Seamlessly pick the right framework for training, evaluation, production
Contents
---------------------------------
The library currently contains PyTorch and TensorFlow implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
3. `GPT-2 <https://blog.openai.com/better-language-models>`_ (from OpenAI) released with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models>`_ by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
.. toctree::
    :maxdepth: 2
    :caption: Notes

    installation
    quickstart
    glossary
    pretrained_models
    model_sharing
    examples
    notebooks
    serialization
    converting_tensorflow_models
    migration
    bertology
    torchscript
    multilingual
    benchmarks

.. toctree::
    :maxdepth: 2
    :caption: Main classes

    main_classes/configuration
    main_classes/model
    main_classes/tokenizer
    main_classes/optimizer_schedules
    main_classes/processors

.. toctree::
    :maxdepth: 2
    :caption: Package Reference

    model_doc/auto
    model_doc/bert
    model_doc/gpt
    model_doc/transformerxl
    model_doc/gpt2
    model_doc/xlm
    model_doc/xlnet
    model_doc/roberta
    model_doc/distilbert
    model_doc/ctrl
    model_doc/camembert
    model_doc/albert
    model_doc/xlmroberta
    model_doc/flaubert

View File

@ -0,0 +1,51 @@
# Installation
Transformers is tested on Python 3.5+ and PyTorch 1.1.0.
## With pip
Transformers can be installed using pip as follows:
``` bash
pip install transformers
```
## From source
To install from source, clone the repository and install with:
``` bash
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install .
```
## Tests
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
Refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests) for details about running tests.
## OpenAI GPT original tokenization workflow
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`:
``` bash
pip install spacy ftfy==4.4.3
python -m spacy download en
```
If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
## Note on model downloads (Continuous Integration or large-scale deployments)
If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
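One possible way to do this (a sketch only; the paths below are placeholders, not a required layout) is to download the files once, save them with `save_pretrained`, and point `from_pretrained` at that local copy in your CI or production jobs:

```python
import os
from transformers import BertModel, BertTokenizer

cache_path = "/shared/models/bert-base-uncased"  # placeholder path under your control
os.makedirs(cache_path, exist_ok=True)

# Download once, then save to a location you control (shared volume, artifact store, ...)
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model.save_pretrained(cache_path)
tokenizer.save_pretrained(cache_path)

# In CI / production, load from the local copy instead of hitting the hosted bucket
model = BertModel.from_pretrained(cache_path)
tokenizer = BertTokenizer.from_pretrained(cache_path)
```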
## Do you want to run a Transformer model on a mobile device?
You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.
It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!

View File

@ -0,0 +1,10 @@
Configuration
----------------------------------------------------
The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
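A minimal, illustrative sketch of that workflow (the checkpoint and output directory below are arbitrary choices):

::

    import os
    from transformers import BertConfig

    # Download a configuration from the hosted bucket (cached locally), tweak it, and save it to disk
    config = BertConfig.from_pretrained("bert-base-uncased")
    config.output_attentions = True

    os.makedirs("./my-bert-config/", exist_ok=True)
    config.save_pretrained("./my-bert-config/")

    # Reload the configuration from the local directory
    config = BertConfig.from_pretrained("./my-bert-config/")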
``PretrainedConfig``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PretrainedConfig
:members:

View File

@ -0,0 +1,21 @@
Models
----------------------------------------------------
The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
``PreTrainedModel`` also implements a few methods which are common among all the models to:
- resize the input token embeddings when new tokens are added to the vocabulary
- prune the attention heads of the model.
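A minimal sketch of those two operations (the added tokens and pruned heads below are arbitrary, illustrative choices):

::

    from transformers import BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    # Resize the token embeddings after adding new tokens to the tokenizer
    tokenizer.add_tokens(["[NEW_TOKEN_1]", "[NEW_TOKEN_2]"])
    model.resize_token_embeddings(len(tokenizer))

    # Prune heads 0 and 1 of layer 2
    model.prune_heads({2: [0, 1]})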
``PreTrainedModel``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PreTrainedModel
:members:
``TFPreTrainedModel``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFPreTrainedModel
:members:

View File

@ -0,0 +1,72 @@
Optimizer
----------------------------------------------------
The ``.optimization`` module provides:
- an optimizer with fixed weight decay that can be used to fine-tune models,
- several schedules in the form of schedule objects that inherit from ``_LRSchedule``, and
- a gradient accumulation class to accumulate the gradients of multiple batches.
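A minimal sketch of combining the optimizer with one of the schedules (the model and hyper-parameters below are illustrative, not recommendations):

::

    from transformers import AdamW, BertForSequenceClassification, get_cosine_schedule_with_warmup

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

    # AdamW applies (fixed) weight decay; the values here are only examples
    optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

    # Warm up for 100 steps, then follow a cosine decay over 1000 training steps
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=1000
    )

    # In the training loop, call optimizer.step() followed by scheduler.step() after each batch.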
``AdamW``
~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AdamW
:members:
``AdamWeightDecay``
~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AdamWeightDecay
:members:
.. autofunction:: transformers.create_optimizer
Schedules
----------------------------------------------------
Learning Rate Schedules
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.get_constant_schedule
.. autofunction:: transformers.get_constant_schedule_with_warmup
.. image:: /imgs/warmup_constant_schedule.png
    :target: /imgs/warmup_constant_schedule.png
    :alt:

.. autofunction:: transformers.get_cosine_schedule_with_warmup

.. image:: /imgs/warmup_cosine_schedule.png
    :target: /imgs/warmup_cosine_schedule.png
    :alt:

.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup

.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
    :target: /imgs/warmup_cosine_hard_restarts_schedule.png
    :alt:

.. autofunction:: transformers.get_linear_schedule_with_warmup

.. image:: /imgs/warmup_linear_schedule.png
    :target: /imgs/warmup_linear_schedule.png
    :alt:
``Warmup``
~~~~~~~~~~~~~~~~
.. autoclass:: transformers.WarmUp
:members:
Gradient Strategies
----------------------------------------------------
``GradientAccumulator``
~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GradientAccumulator

View File

@ -0,0 +1,153 @@
Processors
----------------------------------------------------
This library includes processors for several traditional tasks. These processors can be used to process a dataset into
examples that can be fed to a model.
Processors
~~~~~~~~~~~~~~~~~~~~~
All processors follow the same architecture which is that of the
:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
of :class:`~transformers.data.processors.utils.InputExample`. These
:class:`~transformers.data.processors.utils.InputExample` can be converted to
:class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
.. autoclass:: transformers.data.processors.utils.DataProcessor
:members:
.. autoclass:: transformers.data.processors.utils.InputExample
:members:
.. autoclass:: transformers.data.processors.utils.InputFeatures
:members:
GLUE
~~~~~~~~~~~~~~~~~~~~~
`General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates
the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
`GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://openreview.net/pdf?id=rJ4km2R5t7>`__
This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
Those processors are:
- :class:`~transformers.data.processors.utils.MrpcProcessor`
- :class:`~transformers.data.processors.utils.MnliProcessor`
- :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
- :class:`~transformers.data.processors.utils.Sst2Processor`
- :class:`~transformers.data.processors.utils.StsbProcessor`
- :class:`~transformers.data.processors.utils.QqpProcessor`
- :class:`~transformers.data.processors.utils.QnliProcessor`
- :class:`~transformers.data.processors.utils.RteProcessor`
- :class:`~transformers.data.processors.utils.WnliProcessor`
Additionally, the following method can be used to load values from a data file and convert them to a list of
:class:`~transformers.data.processors.utils.InputExample`.
.. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
Example usage
^^^^^^^^^^^^^^^^^^^^^^^^^
An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
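As an additional minimal sketch (the tokenizer, data directory and parameters below are illustrative placeholders, not taken from that script):

::

    from transformers import BertTokenizer
    from transformers.data.processors.glue import MrpcProcessor, glue_convert_examples_to_features

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    processor = MrpcProcessor()

    # "/path/to/MRPC" is a placeholder for a directory containing the GLUE MRPC data files
    examples = processor.get_dev_examples("/path/to/MRPC")
    features = glue_convert_examples_to_features(
        examples, tokenizer, max_length=128, task="mrpc"
    )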
XNLI
~~~~~~~~~~~~~~~~~~~~~
`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates
the quality of cross-lingual text representations.
XNLI is a crowd-sourced dataset based on `MultiNLI <http://www.nyu.edu/projects/bowman/multinli/>`__: pairs of text are labeled with textual entailment
annotations for 15 different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
It was released together with the paper
`XNLI: Evaluating Cross-lingual Sentence Representations <https://arxiv.org/abs/1809.05053>`__
This library hosts the processor to load the XNLI data:
- :class:`~transformers.data.processors.utils.XnliProcessor`
Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
An example using these processors is given in the
`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
SQuAD
~~~~~~~~~~~~~~~~~~~~~
`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer//>`__ is a benchmark that evaluates
the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
`SQuAD: 100,000+ Questions for Machine Comprehension of Text <https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside
the paper `Know What You Don't Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
This library hosts a processor for each of the two versions:
Processors
^^^^^^^^^^^^^^^^^^^^^^^^^
Those processors are:
- :class:`~transformers.data.processors.utils.SquadV1Processor`
- :class:`~transformers.data.processors.utils.SquadV2Processor`
They both inherit from the abstract class :class:`~transformers.data.processors.utils.SquadProcessor`
.. autoclass:: transformers.data.processors.squad.SquadProcessor
:members:
Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.utils.SquadFeatures`
that can be used as model inputs.
.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
These processors as well as the aforementioned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
Examples are given below.
Example usage
^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example using the processors as well as the conversion method using data files:
Example::

    # Loading a V2 processor
    processor = SquadV2Processor()
    examples = processor.get_dev_examples(squad_v2_data_dir)

    # Loading a V1 processor
    processor = SquadV1Processor()
    examples = processor.get_dev_examples(squad_v1_data_dir)

    features = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
    )
Using `tensorflow_datasets` is as easy as using a data file:
Example::

    # tensorflow_datasets only handles Squad V1.
    tfds_examples = tfds.load("squad")
    examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)

    features = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
    )
Another example using these processors is given in the
`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.

View File

@ -0,0 +1,16 @@
Tokenizer
----------------------------------------------------
The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).
``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers:
- tokenizing, converting tokens to ids and back and encoding/decoding,
- adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...),
- managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization).
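A minimal sketch of these operations with a concrete tokenizer (the sentence and added token are illustrative):

::

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

    # Tokenize, convert to ids and decode back to a string
    ids = tokenizer.encode("Hello world!")
    text = tokenizer.decode(ids)

    # Add a new token to the vocabulary (the matching model embeddings should then be resized)
    tokenizer.add_tokens(["[NEW_TOKEN]"])

    # Special tokens are tracked by role
    print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token)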
``PreTrainedTokenizer``
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PreTrainedTokenizer
:members:

109
docs/source/migration.md Normal file
View File

@ -0,0 +1,109 @@
# Migrating from pytorch-pretrained-bert
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`
### Models always output `tuples`
The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models' forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
```python
# Let's load our model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
# If you used to have this line in pytorch-pretrained-bert:
loss = model(input_ids, labels=labels)
# Now just use this line in transformers to extract the loss from the output tuple:
outputs = model(input_ids, labels=labels)
loss = outputs[0]
# In transformers you can also have access to the logits:
loss, logits = outputs[:2]
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
outputs = model(input_ids, labels=labels)
loss, logits, attentions = outputs
```
### Serialization
Breaking change in the `from_pretrained()` method:
1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them, don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
2. The additional `*inputs` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute first, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. More precisely, the positional arguments `*inputs` provided to `from_pretrained()` are directly forwarded to the model `__init__()` method, while the keyword arguments `**kwargs` (i) which match configuration class attributes are used to update said attributes and (ii) which don't match any configuration class attribute are forwarded to the model `__init__()` method.
Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
Here is an example:
```python
### Let's load a model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
### Do some stuff to our model and tokenizer
# Ex: add new tokens to the vocabulary and embeddings of our model
tokenizer.add_tokens(['[SPECIAL_TOKEN_1]', '[SPECIAL_TOKEN_2]'])
model.resize_token_embeddings(len(tokenizer))
# Train our model
train(model)
### Now let's save our model and tokenizer to a directory
model.save_pretrained('./my_saved_model_directory/')
tokenizer.save_pretrained('./my_saved_model_directory/')
### Reload the model and the tokenizer
model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
```
### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer which has a few differences:
- it only implements the weight decay correction,
- schedules are now external (see below),
- gradient clipping is now also external (see below).
The new `AdamW` optimizer matches the PyTorch `Adam` optimizer API and lets you use standard PyTorch or apex methods for the schedule and clipping.
The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
Here is a conversion example from `BertAdam` with a linear warmup and decay schedule to `AdamW` and the same schedule:
```python
# Parameters:
lr = 1e-3
max_grad_norm = 1.0
num_training_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1
### Previously BertAdam optimizer was instantiated like this:
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, num_training_steps=num_training_steps)
### and used like this:
for batch in train_data:
    loss = model(batch)
    loss.backward()
    optimizer.step()

### In Transformers, optimizer and schedules are split and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
### and used like this:
for batch in train_data:
    loss = model(batch)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
    optimizer.step()
    scheduler.step()
```

View File

@ -0,0 +1,93 @@
ALBERT
----------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~
The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_
by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT:
- Splitting the embedding matrix into two smaller matrices
- Using repeating layers split among groups
The abstract from the paper is the following:
*Increasing model size when pretraining natural language representations often results in improved performance on
downstream tasks. However, at some point further model increases become harder due to GPU/TPU memory limitations,
longer training times, and unexpected model degradation. To address these problems, we present two parameter-reduction
techniques to lower memory consumption and increase the training speed of BERT. Comprehensive empirical evidence shows
that our proposed methods lead to models that scale much better compared to the original BERT. We also use a
self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream
tasks with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE,
RACE, and SQuAD benchmarks while having fewer parameters compared to BERT-large.*
Tips:
- ALBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
- ALBERT uses repeating layers which results in a small memory footprint, however the computational cost remains
similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
number of (repeating) layers.
AlbertConfig
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertConfig
:members:
AlbertTokenizer
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertTokenizer
:members:
AlbertModel
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertModel
:members:
AlbertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForMaskedLM
:members:
AlbertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForSequenceClassification
:members:
AlbertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForQuestionAnswering
:members:
TFAlbertModel
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertModel
:members:
TFAlbertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForMaskedLM
:members:
TFAlbertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForSequenceClassification
:members:

View File

@ -0,0 +1,65 @@
AutoModels
-----------
In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary:
Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create an instance of ``BertModel``).
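A minimal sketch of the pattern (the checkpoint name is only an example):

::

    from transformers import AutoConfig, AutoModel, AutoTokenizer

    # The architecture is inferred from the name or path of the pretrained checkpoint
    config = AutoConfig.from_pretrained("bert-base-cased")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    model = AutoModel.from_pretrained("bert-base-cased")  # an instance of BertModel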
``AutoConfig``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoConfig
:members:
``AutoTokenizer``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoTokenizer
:members:
``AutoModel``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModel
:members:
``AutoModelForPreTraining``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForPreTraining
:members:
``AutoModelWithLMHead``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelWithLMHead
:members:
``AutoModelForSequenceClassification``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForSequenceClassification
:members:
``AutoModelForQuestionAnswering``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForQuestionAnswering
:members:
``AutoModelForTokenClassification``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForTokenClassification
:members:

View File

@ -0,0 +1,162 @@
BERT
----------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~
The BERT model was proposed in `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__
by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
pre-trained using a combination of masked language modeling objective and next sentence prediction
on a large corpus comprising the Toronto Book Corpus and Wikipedia.
The abstract from the paper is the following:
*We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations
from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional
representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result,
the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models
for a wide range of tasks, such as question answering and language inference, without substantial task-specific
architecture modifications.*
*BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural
language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI
accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute
improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).*
Tips:
- BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
- BERT was trained with a masked language modeling (MLM) objective. It is therefore efficient at predicting masked
tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language
modeling (CLM) objective are better in that regard.
- Alongside MLM, BERT was trained using a next sentence prediction (NSP) objective, with the [CLS] token acting as an
  approximate summary of the sequence. You can use this token (the first token in a sequence built with special tokens)
  to get a sequence-level prediction rather than a token-level prediction. However, averaging over the sequence may
  yield better results than using the [CLS] token.
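As a minimal sketch of the MLM objective mentioned above (assuming the ``bert-base-uncased`` checkpoint and the PyTorch backend):

.. code-block::

   import torch
   from transformers import BertTokenizer, BertForMaskedLM

   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
   model = BertForMaskedLM.from_pretrained('bert-base-uncased')

   # Mask one token and let the model predict it
   text = "Paris is the " + tokenizer.mask_token + " of France."
   input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
   masked_index = input_ids[0].tolist().index(tokenizer.mask_token_id)

   with torch.no_grad():
       scores = model(input_ids)[0]                          # (batch, seq_len, vocab_size)
   predicted_id = scores[0, masked_index].argmax().item()
   print(tokenizer.convert_ids_to_tokens([predicted_id]))    # expected to be something like 'capital'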
BertConfig
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertConfig
:members:
BertTokenizer
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertTokenizer
:members:
BertModel
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertModel
:members:
BertForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForPreTraining
:members:
BertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForMaskedLM
:members:
BertForNextSentencePrediction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForNextSentencePrediction
:members:
BertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForSequenceClassification
:members:
BertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForMultipleChoice
:members:
BertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForTokenClassification
:members:
BertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForQuestionAnswering
:members:
TFBertModel
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertModel
:members:
TFBertForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForPreTraining
:members:
TFBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForMaskedLM
:members:
TFBertForNextSentencePrediction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForNextSentencePrediction
:members:
TFBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForSequenceClassification
:members:
TFBertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForMultipleChoice
:members:
TFBertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForTokenClassification
:members:
TFBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForQuestionAnswering
:members:


@ -0,0 +1,99 @@
CamemBERT
----------------------------------------------------
The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__
by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019. It is a model
trained on 138GB of French text.
The abstract from the paper is the following:
*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success,
most available models have either been trained on English data or on the concatenation of data in multiple
languages. This makes practical use of such models --in all languages except English-- very limited. Aiming
to address this issue for French, we release CamemBERT, a French version of the Bi-directional Encoders for
Transformers (BERT). We measure the performance of CamemBERT compared to multilingual models in multiple
downstream tasks, namely part-of-speech tagging, dependency parsing, named-entity recognition, and natural
language inference. CamemBERT improves the state of the art for most of the tasks considered. We release the
pretrained model for CamemBERT hoping to foster research and downstream applications for French NLP.*
Tips:
- This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
examples as well as the information relative to the inputs and outputs.
CamembertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertConfig
:members:
CamembertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertTokenizer
:members:
CamembertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertModel
:members:
CamembertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForMaskedLM
:members:
CamembertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForSequenceClassification
:members:
CamembertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForMultipleChoice
:members:
CamembertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForTokenClassification
:members:
TFCamembertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertModel
:members:
TFCamembertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForMaskedLM
:members:
TFCamembertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForSequenceClassification
:members:
TFCamembertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForTokenClassification
:members:


@ -0,0 +1,75 @@
CTRL
----------------------------------------------------
CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_
by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
The abstract from the paper is the following:
*Large-scale language models show promising text generation capabilities, but users cannot easily control particular
aspects of the generated text. We release CTRL, a 1.63 billion-parameter conditional transformer language model,
trained to condition on control codes that govern style, content, and task-specific behavior. Control codes were
derived from structure that naturally co-occurs with raw text, preserving the advantages of unsupervised learning
while providing more explicit control over text generation. These codes also allow CTRL to predict which parts of
the training data are most likely given a sequence. This provides a potential method for analyzing large amounts
of data via model-based source attribution.*
Tips:
- CTRL makes use of control codes to generate text: it requires generations to be started by certain words, sentences
or links to generate coherent text. Refer to the `original implementation <https://github.com/salesforce/ctrl>`__
for more information.
- CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
- CTRL was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
  token in a sequence. Leveraging this feature allows CTRL to generate syntactically coherent text, as can be
  observed in the `run_generation.py` example script.
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
of this argument.
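A rough sketch of priming generation with a control code (the ``ctrl`` checkpoint and the ``Books`` control code are used here as assumptions; refer to the original repository for the full list of codes):

.. code-block::

   import torch
   from transformers import CTRLTokenizer, CTRLLMHeadModel

   tokenizer = CTRLTokenizer.from_pretrained('ctrl')
   model = CTRLLMHeadModel.from_pretrained('ctrl')

   # The prompt starts with a control code so that CTRL conditions its generation on it
   input_ids = torch.tensor([tokenizer.encode("Books In the beginning")])
   with torch.no_grad():
       logits, past = model(input_ids)[:2]      # `past` caches key/value pairs for the next step
   next_token_id = logits[0, -1].argmax().item()
   print(tokenizer.decode([next_token_id]))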
CTRLConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLConfig
:members:
CTRLTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLTokenizer
:members:
CTRLModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLModel
:members:
CTRLLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLLMHeadModel
:members:
TFCTRLModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCTRLModel
:members:
TFCTRLLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCTRLLMHeadModel
:members:


@ -0,0 +1,97 @@
DistilBERT
----------------------------------------------------
The DistilBERT model was proposed in the blog post
`Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`__,
and the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__.
DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% fewer
parameters than `bert-base-uncased` and runs 60% faster, while preserving over 95% of BERT's performance as measured on
the GLUE language understanding benchmark.
The abstract from the paper is the following:
*As Transfer Learning from large-scale pre-trained models becomes more prevalent in Natural Language Processing (NLP),
operating these large models in on-the-edge and/or under constrained computational training or inference budgets
remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
counterparts. While most prior work investigated the use of distillation for building task-specific models, we
leverage knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a
BERT model by 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage
the inductive biases learned by larger models during pre-training, we introduce a triple loss combining language
modeling, distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train
and we demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative
on-device study.*
Tips:
- DistilBERT doesn't have `token_type_ids`, so you don't need to indicate which token belongs to which segment. Just
  separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`), as in the sketch below.
- DistilBERT doesn't have options to select the input positions (`position_ids` input). This could be added if
  necessary though; just let us know if you need this option.
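A short sketch of the first tip (assuming the ``distilbert-base-uncased`` checkpoint):

.. code-block::

   import torch
   from transformers import DistilBertTokenizer, DistilBertModel

   tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
   model = DistilBertModel.from_pretrained('distilbert-base-uncased')

   # No `token_type_ids`: the two segments are only separated by the [SEP] token
   text = "How old are you? " + tokenizer.sep_token + " I'm 6 years old"
   input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
   with torch.no_grad():
       last_hidden_state = model(input_ids)[0]   # (batch, seq_len, hidden_size)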
DistilBertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertConfig
:members:
DistilBertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertTokenizer
:members:
DistilBertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertModel
:members:
DistilBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForMaskedLM
:members:
DistilBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForSequenceClassification
:members:
DistilBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForQuestionAnswering
:members:
TFDistilBertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertModel
:members:
TFDistilBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForMaskedLM
:members:
TFDistilBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForSequenceClassification
:members:
TFDistilBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForQuestionAnswering
:members:


@ -0,0 +1,72 @@
FlauBERT
----------------------------------------------------
The FlauBERT model was proposed in the paper
`FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le et al.
It's a transformer pre-trained using a masked language modeling (MLM) objective (BERT-like).
The abstract from the paper is the following:
*Language models have become a key step to achieve state-of-the art results in many different Natural Language
Processing (NLP) tasks. Leveraging the huge amount of unlabeled texts nowadays available, they provide an efficient
way to pre-train continuous word representations that can be fine-tuned for a downstream task, along with their
contextualization at the sentence level. This has been widely demonstrated for English using contextualized
representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018; Radford et al., 2018; Devlin et
al., 2019; Yang et al., 2019b). In this paper, we introduce and share FlauBERT, a model learned on a very large
and heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre
for Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most
of the time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified
evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared
to the research community for further reproducible experiments in French NLP.*
FlaubertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertConfig
:members:
FlaubertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertTokenizer
:members:
FlaubertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertModel
:members:
FlaubertWithLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertWithLMHeadModel
:members:
FlaubertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertForSequenceClassification
:members:
FlaubertForQuestionAnsweringSimple
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertForQuestionAnsweringSimple
:members:
FlaubertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertForQuestionAnswering
:members:


@ -0,0 +1,92 @@
OpenAI GPT
----------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~
OpenAI GPT model was proposed in `Improving Language Understanding by Generative Pre-Training <https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf>`__
by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. It's a causal (unidirectional)
transformer pre-trained using language modeling on a large corpus with long range dependencies, the Toronto Book Corpus.
The abstract from the paper is the following:
*Natural language understanding comprises a wide range of diverse tasks such
as textual entailment, question answering, semantic similarity assessment, and
document classification. Although large unlabeled text corpora are abundant,
labeled data for learning these specific tasks is scarce, making it challenging for
discriminatively trained models to perform adequately. We demonstrate that large
gains on these tasks can be realized by generative pre-training of a language model
on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each
specific task. In contrast to previous approaches, we make use of task-aware input
transformations during fine-tuning to achieve effective transfer while requiring
minimal changes to the model architecture. We demonstrate the effectiveness of
our approach on a wide range of benchmarks for natural language understanding.
Our general task-agnostic model outperforms discriminatively trained models that
use architectures specifically crafted for each task, significantly improving upon the
state of the art in 9 out of the 12 tasks studied.*
Tips:
- GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
- GPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
  token in a sequence. Leveraging this feature allows GPT to generate syntactically coherent text, as can be
  observed in the `run_generation.py` example script.
`Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by
Hugging Face showcasing the generative capabilities of several models. GPT is one of them.
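A minimal next-token sketch of the CLM objective (assuming the ``openai-gpt`` checkpoint):

.. code-block::

   import torch
   from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
   model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

   input_ids = torch.tensor([tokenizer.encode("the cat sat on the")])
   with torch.no_grad():
       logits = model(input_ids)[0]                  # (batch, seq_len, vocab_size)
   next_token_id = logits[0, -1].argmax().item()     # greedy choice of the next token
   print(tokenizer.decode([next_token_id]))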
OpenAIGPTConfig
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTConfig
:members:
OpenAIGPTTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTTokenizer
:members:
OpenAIGPTModel
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTModel
:members:
OpenAIGPTLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTLMHeadModel
:members:
OpenAIGPTDoubleHeadsModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
:members:
TFOpenAIGPTModel
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFOpenAIGPTModel
:members:
TFOpenAIGPTLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFOpenAIGPTLMHeadModel
:members:
TFOpenAIGPTDoubleHeadsModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
:members:


@ -0,0 +1,91 @@
OpenAI GPT2
----------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~
OpenAI GPT-2 model was proposed in
`Language Models are Unsupervised Multitask Learners`_
by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
corpus of ~40 GB of text data.
The abstract from the paper is the following:
*GPT-2 is a large transformer-based language model with 1.5 billion parameters, trained on a dataset
of 8 million web pages. GPT-2 is trained with a simple objective: predict the next word, given all of the previous
words within some text. The diversity of the dataset causes this simple goal to contain naturally occurring
demonstrations of many tasks across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X
the parameters and trained on more than 10X the amount of data.*
Tips:
- GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
- GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
  token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text, as can be
  observed in the `run_generation.py` example script.
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
of this argument.
`Write With Transformer <https://transformer.huggingface.co/doc/gpt2-large>`__ is a webapp created and hosted by
Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
different sizes: small, medium, large, xl and a distilled version of the small checkpoint: distilgpt-2.
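A hedged sketch of reusing `past` during generation (assuming the ``gpt2`` checkpoint); only the newly generated token is fed back, while the cached key/value pairs carry the context:

.. code-block::

   import torch
   from transformers import GPT2Tokenizer, GPT2LMHeadModel

   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
   model = GPT2LMHeadModel.from_pretrained('gpt2')

   generated = tokenizer.encode("The Manhattan bridge")
   input_ids = torch.tensor([generated])
   past = None
   for _ in range(5):
       with torch.no_grad():
           logits, past = model(input_ids, past=past)[:2]  # cached key/values avoid re-computation
       next_token_id = logits[0, -1].argmax().item()
       generated.append(next_token_id)
       input_ids = torch.tensor([[next_token_id]])          # only the new token is needed as input
   print(tokenizer.decode(generated))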
GPT2Config
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2Config
:members:
GPT2Tokenizer
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2Tokenizer
:members:
GPT2Model
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2Model
:members:
GPT2LMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2LMHeadModel
:members:
GPT2DoubleHeadsModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2DoubleHeadsModel
:members:
TFGPT2Model
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFGPT2Model
:members:
TFGPT2LMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFGPT2LMHeadModel
:members:
TFGPT2DoubleHeadsModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFGPT2DoubleHeadsModel
:members:


@ -0,0 +1,94 @@
RoBERTa
----------------------------------------------------
The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_
by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
Veselin Stoyanov. It is based on Google's BERT model released in 2018.
It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining
objective and training with much larger mini-batches and learning rates.
The abstract from the paper is the following:
*Language model pretraining has led to significant performance gains but careful comparison between different
approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes,
and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication
study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and
training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of
every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These
results highlight the importance of previously overlooked design choices, and raise questions about the source
of recently reported improvements. We release our models and code.*
Tips:
- This implementation is the same as :class:`~transformers.BertModel` with a tiny embeddings tweak as well as a
  setup for RoBERTa pretrained models.
- `Camembert <./camembert.html>`__ is a wrapper around RoBERTa. Refer to this page for usage examples.
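A brief usage sketch (assuming the ``roberta-base`` checkpoint; the API mirrors BERT's):

.. code-block::

   import torch
   from transformers import RobertaTokenizer, RobertaModel

   tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
   model = RobertaModel.from_pretrained('roberta-base')

   # Byte-level BPE tokenization, then a forward pass returning the hidden states
   input_ids = torch.tensor([tokenizer.encode("Hello world!", add_special_tokens=True)])
   with torch.no_grad():
       last_hidden_state = model(input_ids)[0]   # (batch, seq_len, hidden_size)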
RobertaConfig
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.RobertaConfig
:members:
RobertaTokenizer
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.RobertaTokenizer
:members:
RobertaModel
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.RobertaModel
:members:
RobertaForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.RobertaForMaskedLM
:members:
RobertaForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.RobertaForSequenceClassification
:members:
RobertaForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.RobertaForTokenClassification
:members:
TFRobertaModel
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFRobertaModel
:members:
TFRobertaForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFRobertaForMaskedLM
:members:
TFRobertaForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFRobertaForSequenceClassification
:members:
TFRobertaForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFRobertaForTokenClassification
:members:


@ -0,0 +1,73 @@
Transformer XL
----------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~
The Transformer-XL model was proposed in
`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__
by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can reuse
previously computed hidden-states to attend to longer context (memory).
This model also uses adaptive softmax inputs and outputs (tied).
The abstract from the paper is the following:
*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the
setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency
beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and
a novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves
the context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and
450% longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up
to 1,800+ times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results
of bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on
Penn Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
coherent, novel text articles with thousands of tokens.*
Tips:
- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right.
The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
- Transformer-XL is one of the few models that has no sequence length limit.
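A sketch of reusing the memory across segments (assuming the ``transfo-xl-wt103`` checkpoint):

.. code-block::

   import torch
   from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel

   tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
   model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')

   first = torch.tensor([tokenizer.encode("The quick brown fox")])
   second = torch.tensor([tokenizer.encode("jumps over the lazy dog")])

   with torch.no_grad():
       scores, mems = model(first)[:2]              # `mems` caches the hidden states of the first segment
       scores, mems = model(second, mems=mems)[:2]  # the second segment can attend to the first one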
TransfoXLConfig
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TransfoXLConfig
:members:
TransfoXLTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TransfoXLTokenizer
:members:
TransfoXLModel
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TransfoXLModel
:members:
TransfoXLLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TransfoXLLMHeadModel
:members:
TFTransfoXLModel
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFTransfoXLModel
:members:
TFTransfoXLLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFTransfoXLLMHeadModel
:members:


@ -0,0 +1,106 @@
XLM
----------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~
The XLM model was proposed in `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_
by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives:
- a causal language modeling (CLM) objective (next token prediction),
- a masked language modeling (MLM) objective (Bert-like), or
- a Translation Language Modeling (TLM) objective (an extension of BERT's MLM to multiple language inputs)
The abstract from the paper is the following:
*Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding.
In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining.
We propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual
data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain
state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI,
our approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation,
we obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On
supervised machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming
the previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.*
Tips:
- XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to
select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation).
- XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the
`multi-lingual <../multilingual.html>`__ page for more information.
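A compact sketch of the `langs` argument (assuming the ``xlm-mlm-enfr-1024`` checkpoint; the multi-lingual page walks through this in more detail):

.. code-block::

   import torch
   from transformers import XLMTokenizer, XLMModel

   tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024')
   model = XLMModel.from_pretrained('xlm-mlm-enfr-1024')

   input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])
   langs = torch.full_like(input_ids, tokenizer.lang2id['en'])  # one language id per input token
   with torch.no_grad():
       hidden_states = model(input_ids, langs=langs)[0]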
XLMConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMConfig
:members:
XLMTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMTokenizer
:members:
XLMModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMModel
:members:
XLMWithLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMWithLMHeadModel
:members:
XLMForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMForSequenceClassification
:members:
XLMForQuestionAnsweringSimple
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMForQuestionAnsweringSimple
:members:
XLMForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMForQuestionAnswering
:members:
TFXLMModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLMModel
:members:
TFXLMWithLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLMWithLMHeadModel
:members:
TFXLMForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLMForSequenceClassification
:members:
TFXLMForQuestionAnsweringSimple
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
:members:


@ -0,0 +1,102 @@
XLM-RoBERTa
------------------------------------------
The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__
by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán,
Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019.
It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.
The abstract from the paper is the following:
*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for
a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly
outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy
on XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model.
We also present a detailed empirical evaluation of the key factors that are required to achieve these gains,
including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and
low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling
without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE
and XNLI benchmarks. We will make XLM-R code, data, and models publicly available.*
Tips:
- This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
examples as well as the information relative to the inputs and outputs.
XLMRobertaConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMRobertaConfig
:members:
XLMRobertaTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMRobertaTokenizer
:members:
XLMRobertaModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMRobertaModel
:members:
XLMRobertaForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMRobertaForMaskedLM
:members:
XLMRobertaForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMRobertaForSequenceClassification
:members:
XLMRobertaForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMRobertaForMultipleChoice
:members:
XLMRobertaForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLMRobertaForTokenClassification
:members:
TFXLMRobertaModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLMRobertaModel
:members:
TFXLMRobertaForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLMRobertaForMaskedLM
:members:
TFXLMRobertaForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLMRobertaForSequenceClassification
:members:
TFXLMRobertaForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLMRobertaForTokenClassification
:members:


@ -0,0 +1,124 @@
XLNet
----------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~
The XLNet model was proposed in `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_
by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
XLNet is an extension of the Transformer-XL model pre-trained using an autoregressive method
to learn bidirectional contexts by maximizing the expected likelihood over all permutations
of the input sequence factorization order.
The abstract from the paper is the following:
*With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves
better performance than pretraining approaches based on autoregressive language modeling. However, relying on
corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a
pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive
pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over
all permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive
formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model,
into pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by
a large margin, including question answering, natural language inference, sentiment analysis, and document ranking.*
Tips:
- The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
- Due to the difficulty of training a fully auto-regressive model over various factorization orders,
  XLNet is pretrained using only a sub-set of the output tokens as targets, which are selected
  with the `target_mapping` input.
- To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and
`target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
- XLNet is one of the few models that has no sequence length limit.
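A hedged sketch of the `perm_mask` / `target_mapping` mechanism, predicting the last token of a prompt without letting any token see it (assuming the ``xlnet-base-cased`` checkpoint):

.. code-block::

   import torch
   from transformers import XLNetTokenizer, XLNetLMHeadModel

   tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
   model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')

   input_ids = torch.tensor([tokenizer.encode("Hello, my dog is very", add_special_tokens=False)])
   seq_len = input_ids.shape[1]

   # perm_mask[k, i, j] = 1 means token i does not attend to token j:
   # here no token is allowed to see the last position, which we want to predict
   perm_mask = torch.zeros((1, seq_len, seq_len), dtype=torch.float)
   perm_mask[:, :, -1] = 1.0

   # target_mapping selects which positions are predicted (here: the last one only)
   target_mapping = torch.zeros((1, 1, seq_len), dtype=torch.float)
   target_mapping[0, 0, -1] = 1.0

   with torch.no_grad():
       next_token_logits = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)[0]
   # next_token_logits has shape (batch, num_predicted_tokens, vocab_size)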
XLNetConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetConfig
:members:
XLNetTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetTokenizer
:members:
XLNetModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetModel
:members:
XLNetLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetLMHeadModel
:members:
XLNetForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForSequenceClassification
:members:
XLNetForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForTokenClassification
:members:
XLNetForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForMultipleChoice
:members:
XLNetForQuestionAnsweringSimple
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForQuestionAnsweringSimple
:members:
XLNetForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XLNetForQuestionAnswering
:members:
TFXLNetModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLNetModel
:members:
TFXLNetLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLNetLMHeadModel
:members:
TFXLNetForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLNetForSequenceClassification
:members:
TFXLNetForQuestionAnsweringSimple
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
:members:


@ -0,0 +1,45 @@
# Model upload and sharing
Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built into the library.
**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
```shell
transformers-cli login
# log in using the same credentials as on huggingface.co
```
Upload your model:
```shell
transformers-cli upload ./path/to/pretrained_model/
# ^^ Upload folder containing weights/tokenizer/config
# saved via `.save_pretrained()`
transformers-cli upload ./config.json [--filename folder/foobar.json]
# ^^ Upload a single file
# (you can optionally override its filename, which can be nested inside a folder)
```
Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
```python
"username/pretrained_model"
```
Anyone can load it from code:
```python
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
model = AutoModel.from_pretrained("username/pretrained_model")
```
Finally, list all your files on S3:
```shell
transformers-cli s3 ls
# List all your S3 objects.
```
You can also delete files:
```shell
transformers-cli s3 rm …
```


@ -0,0 +1,103 @@
Multi-lingual models
================================================
Most of the models available in this library are mono-lingual models (English, Chinese and German). A few
multi-lingual models are available and have different mechanisms than mono-lingual models.
This page details the usage of these models.
The two models that currently support multiple languages are BERT and XLM.
XLM
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
XLM has a total of 10 different checkpoints, only one of which is mono-lingual. The 9 remaining model checkpoints can
be split into two categories: the checkpoints that make use of language embeddings, and those that don't.
XLM & Language Embeddings
------------------------------------------------
This section concerns the following checkpoints:
- ``xlm-mlm-ende-1024`` (Masked language modeling, English-German)
- ``xlm-mlm-enfr-1024`` (Masked language modeling, English-French)
- ``xlm-mlm-enro-1024`` (Masked language modeling, English-Romanian)
- ``xlm-mlm-xnli15-1024`` (Masked language modeling, XNLI languages)
- ``xlm-mlm-tlm-xnli15-1024`` (Masked language modeling + Translation, XNLI languages)
- ``xlm-clm-enfr-1024`` (Causal language modeling, English-French)
- ``xlm-clm-ende-1024`` (Causal language modeling, English-German)
These checkpoints require language embeddings that will specify the language used at inference time. These language
embeddings are represented as a tensor that is of the same shape as the input ids passed to the model. The values in
these tensors depend on the language used and are identifiable using the ``lang2id`` and ``id2lang`` attributes
from the tokenizer.
Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language modeling, English-French):
.. code-block::

   import torch
   from transformers import XLMTokenizer, XLMWithLMHeadModel

   tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
   model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
The different languages this model/tokenizer handles, as well as the ids of these languages are visible using the
``lang2id`` attribute:
.. code-block::

   print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
.. code-block::

   input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1
We should now define the language embedding by using the previously defined language id. We want to create a tensor
filled with the appropriate language ids, of the same size as ``input_ids``. For English, the id is 0:
.. code-block::

   language_id = tokenizer.lang2id['en']  # 0
   langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])

   # We reshape it to be of size (batch_size, sequence_length)
   langs = langs.view(1, -1)  # is now of shape [1, sequence_length] (we have a batch size of 1)
You can then feed it all as input to your model:
.. code-block::

   outputs = model(input_ids, langs=langs)
The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/run_generation.py>`__
can generate text using the CLM checkpoints from XLM, using the language embeddings.
XLM without Language Embeddings
------------------------------------------------
This section concerns the following checkpoints:
- ``xlm-mlm-17-1280`` (Masked language modeling, 17 languages)
- ``xlm-mlm-100-1280`` (Masked language modeling, 100 languages)
These checkpoints do not require language embeddings at inference time. They are used to obtain generic
sentence representations, unlike the previously-mentioned XLM checkpoints.
BERT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
BERT has two checkpoints that can be used for multi-lingual tasks:
- ``bert-base-multilingual-uncased`` (Masked language modeling + Next sentence prediction, 102 languages)
- ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages)
These checkpoints do not require language embeddings at inference time. They should identify the language
used in the context and infer accordingly.

16
docs/source/notebooks.rst Normal file

@ -0,0 +1,16 @@
Notebooks
================================================
We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
* The first notebook (`Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_) extracts the hidden states of a full sequence on each layer of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden states of the models.
* The second notebook (`Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
* The third notebook (`Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
Please follow the instructions given in the notebooks to run and modify them.


@ -0,0 +1,272 @@
Pretrained models
================================================
Here is the full list of the currently provided pretrained models together with a short presentation of each model.
For a list that includes community-uploaded models, refer to `https://huggingface.co/models <https://huggingface.co/models>`__.
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Architecture | Shortcut name | Details of the model |
+===================+============================================================+=======================================================================================================================================+
| BERT | ``bert-base-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on lower-cased English text. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-large-uncased`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
| | | | Trained on lower-cased English text. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-cased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on cased English text. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-large-cased`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
| | | | Trained on cased English text. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-multilingual-uncased`` | | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on lower-cased text in the top 102 languages with the largest Wikipedias |
| | | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-multilingual-cased`` | | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on cased text in the top 104 languages with the largest Wikipedias |
| | | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-chinese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on cased Chinese Simplified and Traditional text. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-german-cased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on cased German text by Deepset.ai |
| | | (see `details on deepset.ai website <https://deepset.ai/german-bert>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-large-uncased-whole-word-masking`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
| | | | Trained on lower-cased English text using Whole-Word-Masking |
| | | (see `details <https://github.com/google-research/bert/#bert>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-large-cased-whole-word-masking`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
| | | | Trained on cased English text using Whole-Word-Masking |
| | | (see `details <https://github.com/google-research/bert/#bert>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-large-uncased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
| | | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD |
| | | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-large-cased-whole-word-masking-finetuned-squad`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters |
| | | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD |
| | | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-cased-finetuned-mrpc`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | The ``bert-base-cased`` model fine-tuned on MRPC |
| | | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-german-dbmdz-cased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on cased German text by DBMDZ |
| | | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on uncased German text by DBMDZ |
| | | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. |
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. |
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-char`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text. Text is tokenized into characters. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-finnish-cased-v1`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on cased Finnish text. |
| | | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-finnish-uncased-v1`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on uncased Finnish text. |
| | | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-dutch-cased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on cased Dutch text. |
| | | (see `details on wietsedv repository <https://github.com/wietsedv/bertje/>`__). |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | OpenAI GPT English model |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| GPT-2 | ``gpt2`` | | 12-layer, 768-hidden, 12-heads, 117M parameters. |
| | | | OpenAI GPT-2 English model |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``gpt2-medium`` | | 24-layer, 1024-hidden, 16-heads, 345M parameters. |
| | | | OpenAI's Medium-sized GPT-2 English model |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``gpt2-large`` | | 36-layer, 1280-hidden, 20-heads, 774M parameters. |
| | | | OpenAI's Large-sized GPT-2 English model |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``gpt2-xl`` | | 48-layer, 1600-hidden, 25-heads, 1558M parameters. |
| | | | OpenAI's XL-sized GPT-2 English model |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Transformer-XL | ``transfo-xl-wt103`` | | 18-layer, 1024-hidden, 16-heads, 257M parameters. |
| | | | English model trained on wikitext-103 |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| XLNet | ``xlnet-base-cased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | XLNet English model |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlnet-large-cased`` | | 24-layer, 1024-hidden, 16-heads, 340M parameters. |
| | | | XLNet Large English model |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| XLM | ``xlm-mlm-en-2048`` | | 12-layer, 2048-hidden, 16-heads |
| | | | XLM English model |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlm-mlm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads |
| | | | XLM English-German model trained on the concatenation of English and German wikipedia |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlm-mlm-enfr-1024`` | | 6-layer, 1024-hidden, 8-heads |
| | | | XLM English-French model trained on the concatenation of English and French wikipedia |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlm-mlm-enro-1024`` | | 6-layer, 1024-hidden, 8-heads |
| | | | XLM English-Romanian Multi-language model |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlm-mlm-xnli15-1024`` | | 12-layer, 1024-hidden, 8-heads |
| | | | XLM Model pre-trained with MLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlm-mlm-tlm-xnli15-1024`` | | 12-layer, 1024-hidden, 8-heads |
| | | | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlm-clm-enfr-1024`` | | 6-layer, 1024-hidden, 8-heads |
| | | | XLM English-French model trained with CLM (Causal Language Modeling) on the concatenation of English and French wikipedia |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlm-clm-ende-1024`` | | 6-layer, 1024-hidden, 8-heads |
| | | | XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlm-mlm-17-1280`` | | 16-layer, 1280-hidden, 16-heads |
| | | | XLM model trained with MLM (Masked Language Modeling) on 17 languages. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``xlm-mlm-100-1280`` | | 16-layer, 1280-hidden, 16-heads |
| | | | XLM model trained with MLM (Masked Language Modeling) on 100 languages. |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| RoBERTa | ``roberta-base`` | | 12-layer, 768-hidden, 12-heads, 125M parameters |
| | | | RoBERTa using the BERT-base architecture |
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``roberta-large`` | | 24-layer, 1024-hidden, 16-heads, 355M parameters |
| | | | RoBERTa using the BERT-large architecture |
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``roberta-large-mnli`` | | 24-layer, 1024-hidden, 16-heads, 355M parameters |
| | | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__. |
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``roberta-base-openai-detector`` | | 12-layer, 768-hidden, 12-heads, 125M parameters |
| | | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model. |
| | | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``roberta-large-openai-detector`` | | 24-layer, 1024-hidden, 16-heads, 355M parameters |
| | | | ``roberta-large`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model. |
| | | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__) |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| DistilBERT | ``distilbert-base-uncased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint |
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``distilbert-base-uncased-distilled-squad`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer. |
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``distilgpt2`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
| | | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. |
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``distilbert-base-german-cased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
| | | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint. |
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``distilbert-base-multilingual-cased`` | | 6-layer, 768-hidden, 12-heads, 134M parameters |
| | | | The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint. |
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters |
| | | | Salesforce's Large-sized CTRL English model |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| CamemBERT | ``camembert-base`` | | 12-layer, 768-hidden, 12-heads, 110M parameters |
| | | | CamemBERT using the BERT-base architecture |
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__) |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| ALBERT | ``albert-base-v1`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
| | | | ALBERT base model |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-large-v1`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
| | | | ALBERT large model |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-xlarge-v1`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
| | | | ALBERT xlarge model |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-xxlarge-v1`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
| | | | ALBERT xxlarge model |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-base-v2`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters |
| | | | ALBERT base model with no dropout, additional training data and longer training |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-large-v2`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters |
| | | | ALBERT large model with no dropout, additional training data and longer training |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-xlarge-v2`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters |
| | | | ALBERT xlarge model with no dropout, additional training data and longer training |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``albert-xxlarge-v2`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters |
| | | | ALBERT xxlarge model with no dropout, additional training data and longer training |
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| T5 | ``t5-small`` | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads, |
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``t5-base`` | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads, |
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``t5-large`` | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads, |
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``t5-3B`` | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads, |
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``t5-11B`` | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads, |
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| XLM-RoBERTa | ``xlm-roberta-base`` | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads, |
|                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                           |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|                   | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                        |
| | | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| FlauBERT | ``flaubert-small-cased`` | | 6-layer, 512-hidden, 8-heads, 54M parameters |
| | | | FlauBERT small architecture |
| | | (see `details <https://github.com/getalp/Flaubert>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``flaubert-base-uncased`` | | 12-layer, 768-hidden, 12-heads, 137M parameters |
| | | | FlauBERT base architecture with uncased vocabulary |
| | | (see `details <https://github.com/getalp/Flaubert>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``flaubert-base-cased`` | | 12-layer, 768-hidden, 12-heads, 138M parameters |
| | | | FlauBERT base architecture with cased vocabulary |
| | | (see `details <https://github.com/getalp/Flaubert>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``flaubert-large-cased`` | | 24-layer, 1024-hidden, 16-heads, 373M parameters |
| | | | FlauBERT large architecture |
| | | (see `details <https://github.com/getalp/Flaubert>`__) |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
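Any shortcut name in the table can be passed to ``from_pretrained()``. As a minimal sketch (the ``bert-base-uncased`` choice below is only an example, any other name from the table works the same way):

.. code-block:: python

    from transformers import AutoTokenizer, AutoModel

    # Downloads (and caches) the vocabulary, configuration and weights for the chosen shortcut name
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")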

docs/source/quickstart.md Normal file
# Quickstart
## Philosophy
Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
The library was designed with two strong goals in mind:
- be as easy and fast to use as possible:
- we strongly limited the number of user-facing abstractions to learn; in fact, there are almost no abstractions, just three standard classes required to use each model: configuration, model and tokenizer,
- all of these classes can be initialized in a simple and unified way from pretrained instances by using a common `from_pretrained()` instantiation method which will take care of downloading (if needed), caching and loading the related class from a pretrained instance supplied in the library or your own saved instance.
- as a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to extend/build-upon the library, just use regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving.
- provide state-of-the-art models with performances as close as possible to the original models:
- we provide at least one example for each architecture which reproduces a result provided by the official authors of said architecture,
- the code is usually as close to the original code base as possible, which means some PyTorch code may not be as *pytorchic* as it could be as a result of being converted from TensorFlow code.
A few other goals:
- expose the models' internals as consistently as possible:
- we give access, using a single API to the full hidden-states and attention weights,
- tokenizer and base model's API are standardized to easily switch between models.
- incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
- a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
- simple ways to mask and prune transformer heads.
## Main concepts
The library is built around three types of classes for each model:
- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 8 model architectures currently provided in the library, e.g. `BertModel`
- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these yourself; in particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings into lists of token indices to be fed to a model, e.g. `BertTokenizer`
All these classes can be instantiated from pretrained instances and saved locally using two methods:
- `from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
- `save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`; a short round-trip sketch follows this list.
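As a minimal round-trip sketch (the class choice and directory name are purely illustrative):
```python
import os
from transformers import BertModel, BertTokenizer

# Instantiate from a pretrained checkpoint provided by the library ...
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# ... save both to a local directory (which must already exist) ...
save_directory = './my_saved_model_directory/'  # illustrative path
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# ... and reload them later from that same directory
model = BertModel.from_pretrained(save_directory)
tokenizer = BertTokenizer.from_pretrained(save_directory)
```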
We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
- the **MAIN CLASSES** section details the common functionalities/methods/attributes of the three main types of classes (configuration, model, tokenizer) plus some optimization-related classes provided as utilities for training,
- the **PACKAGE REFERENCE** section details all the variants of each class for each model architecture and, in particular, the inputs/outputs that you should expect when calling each of them.
## Quick tour: Usage
Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
See full API reference for examples for each model class.
### BERT example
Let's start by preparing a tokenized input (a list of token indices to be fed to BERT) from a text string using `BertTokenizer`
```python
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
```
Let's see how we can use `BertModel` to encode our inputs in hidden-states:
```python
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')
# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()
# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')
# Predict hidden states features for each layer
with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # Transformers models always output tuples.
    # See the models docstrings for the detail of all the outputs
    # In our case, the first element is the hidden state of the last layer of the Bert model
    encoded_layers = outputs[0]
# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
```
And how to use `BertForMaskedLM` to predict a masked token:
```python
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()
# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')
# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]
# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == 'henson'
```
### OpenAI GPT-2
Here is a quick-start example using `GPT2Tokenizer` and `GPT2LMHeadModel` class with OpenAI's pre-trained model to predict the next token from a text prompt.
First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
```python
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Encode a text input
text = "Who was Jim Henson ? Jim Henson was a"
indexed_tokens = tokenizer.encode(text)
# Convert indexed tokens in a PyTorch tensor
tokens_tensor = torch.tensor([indexed_tokens])
```
Let's see how to use `GPT2LMHeadModel` to generate the next token following our text:
```python
# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()
# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
model.to('cuda')
# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]
# get the predicted next sub-word (in our case, the word 'man')
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
```
Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
#### Using the past
GPT-2 as well as some other models (GPT, XLNet, Transfo-XL, CTRL) make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.
Here is a fully-working example using the `past` with `GPT2LMHeadModel` and argmax decoding (which should only be used as an example, as argmax decoding introduces a lot of repetition):
```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained('gpt2')
generated = tokenizer.encode("The Manhattan bridge")
context = torch.tensor([generated])
past = None
for i in range(100):
    print(i)
    output, past = model(context, past=past)
    token = torch.argmax(output[..., -1, :])
    generated += [token.tolist()]
    context = token.unsqueeze(0)
sequence = tokenizer.decode(generated)
print(sequence)
```
The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
### Model2Model example
Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
```python
import torch
from transformers import BertTokenizer, Model2Model
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Encode the input to the encoder (the question)
question = "Who was Jim Henson?"
encoded_question = tokenizer.encode(question)
# Encode the input to the decoder (the answer)
answer = "Jim Henson was a puppeteer"
encoded_answer = tokenizer.encode(answer)
# Convert inputs to PyTorch tensors
question_tensor = torch.tensor([encoded_question])
answer_tensor = torch.tensor([encoded_answer])
```
Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
```python
# In order to compute the loss we need to provide language model
# labels (the token ids that the model should have produced) to
# the decoder.
lm_labels = encoded_answer
labels_tensor = torch.tensor([lm_labels])
# Load pre-trained model (weights)
model = Model2Model.from_pretrained('bert-base-uncased')
# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()
# If you have a GPU, put everything on cuda
question_tensor = question_tensor.to('cuda')
answer_tensor = answer_tensor.to('cuda')
labels_tensor = labels_tensor.to('cuda')
model.to('cuda')
# Predict hidden states features for each layer
with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
    # Transformers models always output tuples.
    # See the models docstrings for the detail of all the outputs
    # In our case, the first element is the value of the LM loss
    lm_loss = outputs[0]
```
This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
```python
# Let's re-use the previous question
question = "Who was Jim Henson?"
encoded_question = tokenizer.encode(question)
question_tensor = torch.tensor([encoded_question])
# This time we try to generate the answer, so we start with an empty sequence
answer = "[CLS]"
encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
answer_tensor = torch.tensor([encoded_answer])
# Load pre-trained model (weights)
model = Model2Model.from_pretrained('fine-tuned-weights')
model.eval()
# If you have a GPU, put everything on cuda
question_tensor = question_tensor.to('cuda')
answer_tensor = answer_tensor.to('cuda')
model.to('cuda')
# Predict all tokens
with torch.no_grad():
    outputs = model(question_tensor, answer_tensor)
    predictions = outputs[0]
# confirm we were able to predict 'jim'
predicted_index = torch.argmax(predictions[0, -1]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == 'jim'
```

Loading Google AI or OpenAI pre-trained weights or PyTorch dump
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``from_pretrained()`` method
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of ``BertForPreTraining`` saved with ``torch.save()``\ ), the PyTorch model classes and the tokenizer can be instantiated using the ``from_pretrained()`` method:
.. code-block:: python
model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
where
* ``BERT_CLASS`` is either a tokenizer to load the vocabulary (\ ``BertTokenizer`` or ``OpenAIGPTTokenizer`` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): ``BertModel``\ , ``BertForMaskedLM``\ , ``BertForNextSentencePrediction``\ , ``BertForPreTraining``\ , ``BertForSequenceClassification``\ , ``BertForTokenClassification``\ , ``BertForMultipleChoice``\ , ``BertForQuestionAnswering``\ , ``OpenAIGPTModel``\ , ``OpenAIGPTLMHeadModel`` or ``OpenAIGPTDoubleHeadsModel``\ , and
* ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is either:
* the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
* ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads, 110M parameters
* ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
* ``bert-base-cased``: 12-layer, 768-hidden, 12-heads , 110M parameters
* ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
* ``bert-base-multilingual-uncased``: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* ``bert-base-multilingual-cased``: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
* ``bert-base-german-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://deepset.ai/german-bert>`__
* ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
* ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
* ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
* ``bert-base-german-dbmdz-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://github.com/dbmdz/german-bert>`__
* ``bert-base-german-dbmdz-uncased``: Trained on (uncased) German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://github.com/dbmdz/german-bert>`__
* ``openai-gpt``: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
* ``gpt2``: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
* ``gpt2-medium``: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
* ``transfo-xl-wt103``: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
* a path or url to a pretrained model archive containing:
* ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
* ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
* ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
* ``from_tf``\ : should we load the weights from a locally saved TensorFlow checkpoint
* ``state_dict``\ : an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
* ``*inputs``\ , `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.
When using an ``uncased model``\ , make sure to pass ``--do_lower_case`` to the example training scripts (or pass ``do_lower_case=True`` to FullTokenizer if you're using your own script and loading the tokenizer yourself).
Examples:
.. code-block:: python
# BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
# OpenAI GPT
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTModel.from_pretrained('openai-gpt')
# Transformer-XL
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
# OpenAI GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
Cache directory
~~~~~~~~~~~~~~~
``pytorch_pretrained_bert`` saves the pretrained weights in a cache directory which is located at (in this order of priority):
* ``cache_dir`` optional arguments to the ``from_pretrained()`` method (see above),
* shell environment variable ``PYTORCH_PRETRAINED_BERT_CACHE``\ ,
* PyTorch cache home + ``/pytorch_pretrained_bert/``
where PyTorch cache home is defined by (in this order):
* shell environment variable ``ENV_TORCH_HOME``
* shell environment variable ``ENV_XDG_CACHE_HOME`` + ``/torch/``
* default: ``~/.cache/torch/``
Usually, if you don't set any specific environment variable, ``pytorch_pretrained_bert`` cache will be at ``~/.cache/torch/pytorch_pretrained_bert/``.
You can always safely delete the ``pytorch_pretrained_bert`` cache, but the pretrained model weights and vocabulary files will have to be re-downloaded from our S3.
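As a short, hedged sketch (the paths below are purely illustrative), the cache location can be redirected either globally through the environment variable or per call through ``cache_dir``:

.. code-block:: python

    import os

    # Option 1: point the shared cache somewhere else (set before the library is imported)
    os.environ["PYTORCH_PRETRAINED_BERT_CACHE"] = "/data/bert_cache"

    from transformers import BertModel

    # Option 2: override the cache directory for a single call
    model = BertModel.from_pretrained("bert-base-uncased", cache_dir="/data/bert_cache")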
Serialization best-practices
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This section explains how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
There are three types of files you need to save to be able to reload a fine-tuned model:
* the model itself which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
* the configuration file of the model which is saved as a JSON file, and
* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
The *default filenames* of these files are as follows:
* the model weights file: ``pytorch_model.bin``\ ,
* the configuration file: ``config.json``\ ,
* the vocabulary file: ``vocab.txt`` for BERT and Transformer-XL, ``vocab.json`` for GPT/GPT-2 (BPE vocabulary),
* for GPT/GPT-2 (BPE vocabulary) the additional merges file: ``merges.txt``.
**If you save a model using these *default filenames*\ , you can then re-load the model and tokenizer using the ``from_pretrained()`` method.**
Here is the recommended way of saving the model, configuration and vocabulary to an ``output_dir`` directory and reloading the model and tokenizer afterwards:
.. code-block:: python
from transformers import WEIGHTS_NAME, CONFIG_NAME
output_dir = "./models/"
# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)
# Step 2: Re-load the saved model and vocabulary
# Example for a Bert model
model = BertForQuestionAnswering.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case) # Add specific options if needed
# Example for a GPT model
model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
Here is another way you can save and reload the model if you want to use specific paths for each type of files:
.. code-block:: python
output_model_file = "./models/my_own_model_file.bin"
output_config_file = "./models/my_own_config_file.bin"
output_vocab_file = "./models/my_own_vocab_file.bin"
# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_vocab_file)
# Step 2: Re-load the saved model and vocabulary
# We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
# Here is how to do it in this situation:
# Example for a Bert model
config = BertConfig.from_json_file(output_config_file)
model = BertForQuestionAnswering(config)
state_dict = torch.load(output_model_file)
model.load_state_dict(state_dict)
tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
# Example for a GPT model
config = OpenAIGPTConfig.from_json_file(output_config_file)
model = OpenAIGPTDoubleHeadsModel(config)
state_dict = torch.load(output_model_file)
model.load_state_dict(state_dict)
tokenizer = OpenAIGPTTokenizer(output_vocab_file)

docs/source/torchscript.rst Normal file
TorchScript
================================================
.. note::
This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities
with variable-input-size models. It is a focus of interest to us and we will deepen our analysis in upcoming
releases, with more code examples, a more flexible implementation, and benchmarks comparing python-based codes
with compiled TorchScript.
According to Pytorch's documentation: "TorchScript is a way to create serializable and optimizable models from PyTorch code".
Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
their model to be re-used in other programs, such as efficiency-oriented C++ programs.
We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
they can be exported, and what to be mindful of when using these models with TorchScript.
Exporting a model needs two things:
* dummy inputs to execute a model forward pass.
* the model needs to be instantiated with the ``torchscript`` flag.
These necessities imply several things developers should be careful about. These are detailed below.
Implications
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TorchScript flag and tied weights
------------------------------------------------
This flag is necessary because most of the language models in this repository have tied weights between their
``Embedding`` layer and their ``Decoding`` layer. TorchScript does not allow the export of models that have tied weights,
it is therefore necessary to untie the weights beforehand.
This implies that models instantiated with the ``torchscript`` flag have their ``Embedding`` layer and ``Decoding`` layer
separate, which means that they should not be trained down the line. Training would de-synchronize the two layers,
leading to unexpected results.
This is not the case for models that do not have a Language Model head, as those do not have tied weights. These models
can be safely exported without the ``torchscript`` flag.
Dummy inputs and standard lengths
------------------------------------------------
The dummy inputs are used to do a model forward pass. While the inputs' values are propagating through the layers,
Pytorch keeps track of the different operations executed on each tensor. These recorded operations are then used
to create the "trace" of the model.
The trace is created relative to the inputs' dimensions. It is therefore constrained by the dimensions of the dummy
input, and will not work for any other sequence length or batch size. When trying with a different size, an error such
as:
``The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2``
will be raised. It is therefore recommended to trace the model with a dummy input size at least as large as the largest
input that will be fed to the model during inference. Padding can be performed to fill the missing values. As the model
will have been traced with a large input size, however, the dimensions of the different matrices will be large as well,
resulting in more calculations.
It is recommended to be careful of the total number of operations done on each input and to follow performance closely
when exporting varying sequence-length models.
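As a hedged sketch of the padding idea (the length and pad id below are illustrative, not recommendations):

.. code-block:: python

    import torch

    MAX_LENGTH = 128   # trace the model with the largest sequence length expected at inference time
    PAD_TOKEN_ID = 0   # id of the padding token for the tokenizer in use

    def pad_to_traced_length(token_ids):
        # Right-pad a list of token ids so that every call to the traced model sees the same length
        padded = token_ids + [PAD_TOKEN_ID] * (MAX_LENGTH - len(token_ids))
        return torch.tensor([padded])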
Using TorchScript in Python
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Below are examples showing how to save and load models with Python, as well as how to use a traced model for inference.
Saving a model
------------------------------------------------
This snippet shows how to use TorchScript to export a ``BertModel``. Here the ``BertModel`` is instantiated
according to a ``BertConfig`` class and then saved to disk under the filename ``traced_bert.pt``
.. code-block:: python
from transformers import BertModel, BertTokenizer, BertConfig
import torch
enc = BertTokenizer.from_pretrained("bert-base-uncased")
# Tokenizing input text
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = enc.tokenize(text)
# Masking one of the input tokens
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
# Creating a dummy input
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
dummy_input = [tokens_tensor, segments_tensors]
# Initializing the model with the torchscript flag
# Flag set to True even though it is not necessary as this model does not have an LM Head.
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)
# Instantiating the model
model = BertModel(config)
# The model needs to be in evaluation mode
model.eval()
# If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
# Creating the trace
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
torch.jit.save(traced_model, "traced_bert.pt")
Loading a model
------------------------------------------------
This snippet shows how to load the ``BertModel`` that was previously saved to disk under the name ``traced_bert.pt``.
We are re-using the previously initialised ``dummy_input``.
.. code-block:: python
loaded_model = torch.jit.load("traced_bert.pt")
loaded_model.eval()
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
Using a traced model for inference
------------------------------------------------
Using the traced model for inference is as simple as using its ``__call__`` dunder method:
.. code-block:: python
traced_model(tokens_tensor, segments_tensors)

examples/README.md Normal file
# Examples
In this section a few examples are put together. All of these examples work for several models, making use of the very
similar API between the different models.
**Important**
To run the latest versions of the examples, you have to install from source and install some specific requirements for the examples.
Execute the following steps in a new virtual environment:
```bash
git clone https://github.com/huggingface/transformers
cd transformers
pip install .
pip install -r ./examples/requirements.txt
```
| Section | Description |
|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [TensorFlow 2.0 models on GLUE](#TensorFlow-2.0-Bert-models-on-GLUE) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks.
| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. |
| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
| [Adversarial evaluation of model performances](#adversarial-evaluation-of-model-performances) | Testing a model with adversarial evaluation of natural language inference on the Heuristic Analysis for NLI Systems (HANS) dataset (McCoy et al., 2019.) |
## TensorFlow 2.0 Bert models on GLUE
Based on the script [`run_tf_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py).
Fine-tuning the library TensorFlow 2.0 Bert model for sequence classification on the MRPC task of the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/).
This script has an option for mixed precision (Automatic Mixed Precision / AMP) to run models on Tensor Cores (NVIDIA Volta/Turing GPUs) and future hardware and an option for XLA, which uses the XLA compiler to reduce model runtime.
Options are toggled using `USE_XLA` or `USE_AMP` variables in the script.
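For context, here is a hedged sketch of how such toggles are typically wired in TensorFlow 2.x (the exact lines in `run_tf_glue.py` may differ):

```python
import tensorflow as tf

USE_XLA = False
USE_AMP = False

# XLA: compile the TensorFlow graph with the XLA JIT compiler
tf.config.optimizer.set_jit(USE_XLA)
# AMP: let the graph optimizer run eligible ops in float16 on Tensor Cores
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
```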
These options and the below benchmark are provided by @tlkh.
Quick benchmarks from the script (no other modifications):
| GPU | Mode | Time (2nd epoch) | Val Acc (3 runs) |
| --------- | -------- | ----------------------- | ----------------------|
| Titan V | FP32 | 41s | 0.8438/0.8281/0.8333 |
| Titan V | AMP | 26s | 0.8281/0.8568/0.8411 |
| V100 | FP32 | 35s | 0.8646/0.8359/0.8464 |
| V100 | AMP | 22s | 0.8646/0.8385/0.8411 |
| 1080 Ti | FP32 | 55s | - |
Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used).
## Language model fine-tuning
Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).
Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT
to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa
are fine-tuned using a masked language modeling (MLM) loss.
Before running the following example, you should get a file that contains text on which the language model will be
fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
text that will be used for evaluation.
### GPT-2/GPT and causal language modeling
The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
the tokenization). The loss here is that of causal language modeling.
```bash
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
export TEST_FILE=/path/to/dataset/wiki.test.raw
python run_lm_finetuning.py \
--output_dir=output \
--model_type=gpt2 \
--model_name_or_path=gpt2 \
--do_train \
--train_data_file=$TRAIN_FILE \
--do_eval \
--eval_data_file=$TEST_FILE
```
This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
a score of ~20 perplexity once fine-tuned on the dataset.
### RoBERTa/BERT and masked language modeling
The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
pre-training: masked language modeling.
In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
slightly slower (over-fitting takes more epochs).
We use the `--mlm` flag so that the script may change its loss function.
```bash
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
export TEST_FILE=/path/to/dataset/wiki.test.raw
python run_lm_finetuning.py \
--output_dir=output \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--train_data_file=$TRAIN_FILE \
--do_eval \
--eval_data_file=$TEST_FILE \
--mlm
```
## Language generation
Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL.
A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
can try out the different models available in the library.
Example usage:
```bash
python run_generation.py \
--model_type=gpt2 \
--model_name_or_path=gpt2
```
## GLUE
Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.
GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
uncased BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on single V100 GPUs with total train
batch sizes between 16 and 64. Some of these tasks have a small dataset and training can lead to high variance in the results
between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
| Task | Metric | Result |
|-------|------------------------------|-------------|
| CoLA | Matthew's corr | 49.23 |
| SST-2 | Accuracy | 91.97 |
| MRPC | F1/Accuracy | 89.47/85.29 |
| STS-B | Pearson/Spearman corr.       | 83.95/83.70 |
| QQP | Accuracy/F1 | 88.40/84.31 |
| MNLI | Matched acc./Mismatched acc. | 80.61/81.08 |
| QNLI | Accuracy | 87.46 |
| RTE | Accuracy | 61.73 |
| WNLI | Accuracy | 45.07 |
Some of these results are significantly different from the ones reported on the test set
of the GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the website.
Before running any of these GLUE tasks you should download the
[GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`.
```bash
export GLUE_DIR=/path/to/glue
export TASK_NAME=MRPC
python run_glue.py \
--model_type bert \
--model_name_or_path bert-base-cased \
--task_name $TASK_NAME \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $GLUE_DIR/$TASK_NAME \
--max_seq_length 128 \
--per_gpu_train_batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3.0 \
--output_dir /tmp/$TASK_NAME/
```
where the task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
The dev set results will be present within the text file `eval_results.txt` in the specified `output_dir`.
In the case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate
output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`.
The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI,
CoLA and SST-2. The following section provides details on how to run half-precision training with MRPC. That being
said, there shouldn't be any issues running half-precision training with the remaining GLUE tasks either,
since the data processor for each task inherits from the base class `DataProcessor`.
### MRPC
#### Fine-tuning example
The following example fine-tunes BERT on the Microsoft Research Paraphrase Corpus (MRPC) and runs in less
than 10 minutes on a single K-80, and in 27 seconds (!) on a single Tesla V100 16GB with apex installed.
Before running any of these GLUE tasks you should download the
[GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`.
```bash
export GLUE_DIR=/path/to/glue
python run_glue.py \
--model_type bert \
--model_name_or_path bert-base-cased \
--task_name MRPC \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $GLUE_DIR/MRPC/ \
--max_seq_length 128 \
--per_gpu_train_batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3.0 \
--output_dir /tmp/mrpc_output/
```
Our tests ran on a few seeds with [the original implementation hyper-
parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) and gave evaluation
results between 84% and 88%.
#### Using Apex and mixed-precision
Using Apex and 16-bit precision, the fine-tuning on MRPC only takes 27 seconds. First install
[apex](https://github.com/NVIDIA/apex), then run the following example:
```bash
export GLUE_DIR=/path/to/glue
python run_glue.py \
--model_type bert \
--model_name_or_path bert-base-cased \
--task_name MRPC \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $GLUE_DIR/MRPC/ \
--max_seq_length 128 \
--per_gpu_train_batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3.0 \
--output_dir /tmp/mrpc_output/ \
--fp16
```
#### Distributed training
Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking model and it
reaches F1 > 92 on MRPC.
```bash
export GLUE_DIR=/path/to/glue
python -m torch.distributed.launch \
--nproc_per_node 8 run_glue.py \
--model_type bert \
--model_name_or_path bert-base-cased \
--task_name MRPC \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $GLUE_DIR/MRPC/ \
--max_seq_length 128 \
--per_gpu_train_batch_size 8 \
--learning_rate 2e-5 \
--num_train_epochs 3.0 \
--output_dir /tmp/mrpc_output/
```
Training with these hyper-parameters gave us the following results:
```bash
acc = 0.8823529411764706
acc_and_f1 = 0.901702786377709
eval_loss = 0.3418912578906332
f1 = 0.9210526315789473
global_step = 174
loss = 0.07231863956341798
```
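Once fine-tuning has finished, the checkpoint in the output directory behaves like any other sequence-classification model. A minimal inference sketch, assuming the `/tmp/mrpc_output/` directory from the commands above (the sentence pair is made up):
```python
import torch
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("/tmp/mrpc_output/")
model = BertForSequenceClassification.from_pretrained("/tmp/mrpc_output/")
model.eval()

inputs = tokenizer.encode_plus(
    "The company posted record profits this quarter.",
    "Quarterly profits reached an all-time high for the company.",
    return_tensors="pt",
)

with torch.no_grad():
    logits = model(**inputs)[0]

# For the MRPC processor, label index 1 means the two sentences are paraphrases.
probs = torch.softmax(logits, dim=-1)
print("paraphrase probability:", probs[0, 1].item())
```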
### MNLI
The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task.
```bash
export GLUE_DIR=/path/to/glue
python -m torch.distributed.launch \
--nproc_per_node 8 run_glue.py \
--model_type bert \
--model_name_or_path bert-base-cased \
--task_name mnli \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $GLUE_DIR/MNLI/ \
--max_seq_length 128 \
--per_gpu_train_batch_size 8 \
--learning_rate 2e-5 \
--num_train_epochs 3.0 \
--output_dir output_dir
```
The results are the following:
```bash
***** Eval results *****
acc = 0.8679706601466992
eval_loss = 0.4911287787382479
global_step = 18408
loss = 0.04755385363816904
***** Eval results *****
acc = 0.8747965825874695
eval_loss = 0.45516540421714036
global_step = 18408
loss = 0.04755385363816904
```
## Multiple Choice
Based on the script [`run_multiple_choice.py`](https://github.com/huggingface/transformers/blob/master/examples/run_multiple_choice.py).
#### Fine-tuning on SWAG
Download the [SWAG](https://github.com/rowanz/swagaf/tree/master/data) data.
```bash
# Training on 4 Tesla V100 (16GB) GPUs
export SWAG_DIR=/path/to/swag_data_dir
python ./examples/run_multiple_choice.py \
--model_type roberta \
--task_name swag \
--model_name_or_path roberta-base \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $SWAG_DIR \
--learning_rate 5e-5 \
--num_train_epochs 3 \
--max_seq_length 80 \
--output_dir models_bert/swag_base \
--per_gpu_eval_batch_size=16 \
--per_gpu_train_batch_size=16 \
--gradient_accumulation_steps 2 \
--overwrite_output
```
Training with the defined hyper-parameters yields the following results:
```
***** Eval results *****
eval_acc = 0.8338998300509847
eval_loss = 0.44457291918821606
```
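After training, the multiple-choice head simply scores each candidate ending. A minimal inference sketch, assuming the `models_bert/swag_base` output directory from the command above (the prompt and endings are made up):
```python
import torch
from transformers import RobertaForMultipleChoice, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("models_bert/swag_base")
model = RobertaForMultipleChoice.from_pretrained("models_bert/swag_base")
model.eval()

prompt = "She opened the oven"
endings = ["and took out the cake.", "and started the car.", "and answered the phone.", "and went swimming."]

# Each (prompt, ending) pair is encoded separately; the choices dimension comes second.
encoded = [tokenizer.encode_plus(prompt, ending, max_length=80, pad_to_max_length=True) for ending in endings]
input_ids = torch.tensor([e["input_ids"] for e in encoded]).unsqueeze(0)        # (1, num_choices, seq_len)
attention_mask = torch.tensor([e["attention_mask"] for e in encoded]).unsqueeze(0)

with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask)[0]  # (1, num_choices)

print("best ending:", endings[int(torch.argmax(logits, dim=1))])
```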
## SQuAD
Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
#### Fine-tuning BERT on SQuAD1.0
This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large)
on a single Tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a
`$SQUAD_DIR` directory.
* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
And for SQuAD2.0, you need to download:
- [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
- [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
- [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
```bash
export SQUAD_DIR=/path/to/SQUAD
python run_squad.py \
--model_type bert \
--model_name_or_path bert-base-cased \
--do_train \
--do_eval \
--do_lower_case \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--per_gpu_train_batch_size 12 \
--learning_rate 3e-5 \
--num_train_epochs 2.0 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir /tmp/debug_squad/
```
Training with the previously defined hyper-parameters yields the following results:
```bash
f1 = 88.52
exact_match = 81.22
```
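The resulting checkpoint can then be used to extract answers directly. A minimal sketch, assuming the `/tmp/debug_squad/` directory from the command above (the question and context are made up):
```python
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("/tmp/debug_squad/")
model = BertForQuestionAnswering.from_pretrained("/tmp/debug_squad/")
model.eval()

question = "Where is the Eiffel Tower located?"
context = "The Eiffel Tower is a wrought-iron lattice tower located in Paris, France."
inputs = tokenizer.encode_plus(question, context, return_tensors="pt")

with torch.no_grad():
    start_scores, end_scores = model(**inputs)

# Pick the most likely start and end tokens and decode the span between them
# (a real script would also check that the end index is not before the start index).
start = torch.argmax(start_scores)
end = torch.argmax(end_scores)
answer_ids = inputs["input_ids"][0, start : end + 1].tolist()
print(tokenizer.decode(answer_ids))
```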
#### Distributed training
Here is an example using distributed training on 8 V100 GPUs and the BERT whole-word-masking uncased model to reach an F1 > 93 on SQuAD1.1:
```bash
python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
--model_type bert \
--model_name_or_path bert-large-uncased-whole-word-masking \
--do_train \
--do_eval \
--do_lower_case \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
--per_gpu_eval_batch_size=3 \
--per_gpu_train_batch_size=3
```
Training with the previously defined hyper-parameters yields the following results:
```bash
f1 = 93.15
exact_match = 86.91
```
This fine-tuned model is available as a checkpoint under the reference
`bert-large-uncased-whole-word-masking-finetuned-squad`.
#### Fine-tuning XLNet on SQuAD
This example code fine-tunes XLNet on both the SQuAD1.0 and SQuAD2.0 datasets. See above for how to download the SQuAD data.
##### Command for SQuAD1.0:
```bash
export SQUAD_DIR=/path/to/SQUAD
python run_squad.py \
--model_type xlnet \
--model_name_or_path xlnet-large-cased \
--do_train \
--do_eval \
--do_lower_case \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir ./wwm_cased_finetuned_squad/ \
--per_gpu_eval_batch_size=4 \
--per_gpu_train_batch_size=4 \
--save_steps 5000
```
##### Command for SQuAD2.0:
```bash
export SQUAD_DIR=/path/to/SQUAD
python run_squad.py \
--model_type xlnet \
--model_name_or_path xlnet-large-cased \
--do_train \
--do_eval \
--version_2_with_negative \
--train_file $SQUAD_DIR/train-v2.0.json \
--predict_file $SQUAD_DIR/dev-v2.0.json \
--learning_rate 3e-5 \
--num_train_epochs 4 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir ./wwm_cased_finetuned_squad/ \
--per_gpu_eval_batch_size=2 \
--per_gpu_train_batch_size=2 \
--save_steps 5000
```
A larger batch size may improve performance at the cost of more memory.
##### Results for SQuAD1.0 with the previously defined hyper-parameters:
```python
{
"exact": 85.45884578997162,
"f1": 92.5974600601065,
"total": 10570,
"HasAns_exact": 85.45884578997162,
"HasAns_f1": 92.59746006010651,
"HasAns_total": 10570
}
```
##### Results for SQuAD2.0 with the previously defined hyper-parameters:
```python
{
"exact": 80.4177545691906,
"f1": 84.07154997729623,
"total": 11873,
"HasAns_exact": 76.73751686909581,
"HasAns_f1": 84.05558584352873,
"HasAns_total": 5928,
"NoAns_exact": 84.0874684608915,
"NoAns_f1": 84.0874684608915,
"NoAns_total": 5945
}
```
## Named Entity Recognition
Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for PyTorch and
[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for TensorFlow 2.
This example fine-tunes multilingual BERT on GermEval 2014 (German NER).
Details and results for the fine-tuning were provided by @stefan-it.
### Data (Download and pre-processing steps)
Data can be obtained from the [GermEval 2014](https://sites.google.com/site/germeval2014ner/data) shared task page.
Here are the commands for downloading and pre-processing the train, dev and test datasets. The original data format has four (tab-separated) columns; in a pre-processing step only the two relevant columns (token and outer span NER annotation) are extracted:
```bash
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
```
The GermEval 2014 dataset contains some strange "control character" tokens like `'\x96', '\u200e', '\x95', '\xad' or '\x80'`. One problem with these tokens is that `BertTokenizer` returns an empty token for them, resulting in misaligned `InputExample`s. I wrote a script that a) filters these tokens and b) splits longer sentences into smaller ones (once the max. subtoken length is reached).
```bash
wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"
```
Let's define some variables that we need for further pre-processing steps and training the model:
```bash
export MAX_LENGTH=128
export BERT_MODEL=bert-base-multilingual-cased
```
Run the pre-processing script on training, dev and test datasets:
```bash
python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
```
The GermEval 2014 dataset has many more labels than the CoNLL-2002/2003 datasets, so a dedicated set of labels must be used:
```bash
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
```
### Prepare the run
Additional environment variables must be set:
```bash
export OUTPUT_DIR=germeval-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
export SAVE_STEPS=750
export SEED=1
```
### Run the PyTorch version
To start training, just run:
```bash
python3 run_ner.py --data_dir ./ \
--model_type bert \
--labels ./labels.txt \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_gpu_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict
```
If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
#### Evaluation
Evaluation on development dataset outputs the following for our example:
```bash
10/04/2019 00:42:06 - INFO - __main__ - ***** Eval results *****
10/04/2019 00:42:06 - INFO - __main__ - f1 = 0.8623348017621146
10/04/2019 00:42:06 - INFO - __main__ - loss = 0.07183869666975543
10/04/2019 00:42:06 - INFO - __main__ - precision = 0.8467916366258111
10/04/2019 00:42:06 - INFO - __main__ - recall = 0.8784592370979806
```
On the test dataset the following results could be achieved:
```bash
10/04/2019 00:42:42 - INFO - __main__ - ***** Eval results *****
10/04/2019 00:42:42 - INFO - __main__ - f1 = 0.8614389652384803
10/04/2019 00:42:42 - INFO - __main__ - loss = 0.07064602487454782
10/04/2019 00:42:42 - INFO - __main__ - precision = 0.8604651162790697
10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085
```
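To tag a new sentence with the fine-tuned model, the per-token predictions can be mapped back through `labels.txt`. A minimal sketch, assuming the paths defined above (the German sentence is made up):
```python
import torch
from transformers import BertForTokenClassification, BertTokenizer

labels = [line.strip() for line in open("labels.txt", encoding="utf-8")]
tokenizer = BertTokenizer.from_pretrained("germeval-model")
model = BertForTokenClassification.from_pretrained("germeval-model")
model.eval()

sentence = "Angela Merkel besuchte gestern die Universität Heidelberg ."
inputs = tokenizer.encode_plus(sentence, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs)[0]  # (1, seq_len, num_labels)

predictions = torch.argmax(logits, dim=2)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
for token, prediction in zip(tokens, predictions):
    # Sub-word pieces and special tokens are printed as-is in this sketch.
    print(token, labels[prediction])
```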
#### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run):
| Model | F-Score Dev | F-Score Test |
| --------------------------------- | ----------- | ------------ |
| `bert-large-cased` | 95.59 | 91.70 |
| `roberta-large` | 95.96 | 91.87 |
| `distilbert-base-uncased` | 94.34 | 90.32 |
### Run the TensorFlow 2 version
To start training, just run:
```bash
python3 run_tf_ner.py --data_dir ./ \
--model_type bert \
--labels ./labels.txt \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_device_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict
```
As with the PyTorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
#### Evaluation
Evaluation on development dataset outputs the following for our example:
```bash
precision recall f1-score support
LOCderiv 0.7619 0.6154 0.6809 52
PERpart 0.8724 0.8997 0.8858 4057
OTHpart 0.9360 0.9466 0.9413 711
ORGpart 0.7015 0.6989 0.7002 269
LOCpart 0.7668 0.8488 0.8057 496
LOC 0.8745 0.9191 0.8963 235
ORGderiv 0.7723 0.8571 0.8125 91
OTHderiv 0.4800 0.6667 0.5581 18
OTH 0.5789 0.6875 0.6286 16
PERderiv 0.5385 0.3889 0.4516 18
PER 0.5000 0.5000 0.5000 2
ORG 0.0000 0.0000 0.0000 3
micro avg 0.8574 0.8862 0.8715 5968
macro avg 0.8575 0.8862 0.8713 5968
```
On the test dataset the following results could be achieved:
```bash
precision recall f1-score support
PERpart 0.8847 0.8944 0.8896 9397
OTHpart 0.9376 0.9353 0.9365 1639
ORGpart 0.7307 0.7044 0.7173 697
LOC 0.9133 0.9394 0.9262 561
LOCpart 0.8058 0.8157 0.8107 1150
ORG 0.0000 0.0000 0.0000 8
OTHderiv 0.5882 0.4762 0.5263 42
PERderiv 0.6571 0.5227 0.5823 44
OTH 0.4906 0.6667 0.5652 39
ORGderiv 0.7016 0.7791 0.7383 172
LOCderiv 0.8256 0.6514 0.7282 109
PER 0.0000 0.0000 0.0000 11
micro avg 0.8722 0.8774 0.8748 13869
macro avg 0.8712 0.8774 0.8740 13869
```
## XNLI
Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
[XNLI](https://www.nyu.edu/projects/bowman/xnli/) is a crowd-sourced dataset based on [MultiNLI](http://www.nyu.edu/projects/bowman/multinli/). It is an evaluation benchmark for cross-lingual text representations. Pairs of text are labeled with textual entailment annotations for 15 different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
#### Fine-tuning on XNLI
This example code fine-tunes mBERT (multilingual BERT) on the XNLI dataset. It runs in 106 minutes
on a single Tesla V100 16GB. The data for XNLI can be downloaded from the following links and should be saved (and unzipped) in a
`$XNLI_DIR` directory.
* [XNLI 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip)
* [XNLI-MT 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip)
```bash
export XNLI_DIR=/path/to/XNLI
python run_xnli.py \
--model_type bert \
--model_name_or_path bert-base-multilingual-cased \
--language de \
--train_language en \
--do_train \
--do_eval \
--data_dir $XNLI_DIR \
--per_gpu_train_batch_size 32 \
--learning_rate 5e-5 \
--num_train_epochs 2.0 \
--max_seq_length 128 \
--output_dir /tmp/debug_xnli/ \
--save_steps -1
```
Training with the previously defined hyper-parameters yields the following results on the **test** set:
```bash
acc = 0.7093812375249501
```
## MM-IMDb
Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/mm-imdb/run_mmimdb.py).
[MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a multimodal dataset with around 26,000 movies, including images, plots and other metadata.
### Training on MM-IMDb
```bash
python run_mmimdb.py \
--data_dir /path/to/mmimdb/dataset/ \
--model_type bert \
--model_name_or_path bert-base-uncased \
--output_dir /path/to/save/dir/ \
--do_train \
--do_eval \
--max_seq_len 512 \
--gradient_accumulation_steps 20 \
--num_image_embeds 3 \
--num_train_epochs 100 \
--patience 5
```
## Adversarial evaluation of model performances
Here is an example of evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was graciously provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
This is an example of using `test_hans.py`:
```bash
export HANS_DIR=path-to-hans
export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
python examples/test_hans.py \
--task_name hans \
--model_type $MODEL_TYPE \
--do_eval \
--do_lower_case \
--data_dir $HANS_DIR \
--model_name_or_path $MODEL_PATH \
--max_seq_length 128 \
--output_dir $MODEL_PATH
```
This will create the `hans_predictions.txt` file in `MODEL_PATH`, which can then be evaluated using `hans/evaluate_heur_output.py` from the HANS dataset repository.
The results of a BERT-base model trained on MNLI with batch size 8 and random seed 42, evaluated on the HANS dataset, are as follows:
```bash
Heuristic entailed results:
lexical_overlap: 0.9702
subsequence: 0.9942
constituent: 0.9962
Heuristic non-entailed results:
lexical_overlap: 0.199
subsequence: 0.0396
constituent: 0.118
```

examples/benchmarks.py
@ -0,0 +1,531 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Benchmarking the library on inference and training """
# If checking the tensors placement
# tf.debugging.set_log_device_placement(True)
import argparse
import csv
import timeit
from time import time
from typing import List
from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available
if is_tf_available():
import tensorflow as tf
from transformers import TFAutoModel
if is_torch_available():
import torch
from transformers import AutoModel
input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as
the Director of Hatcheries and Conditioning entered the room, in the
scarcely breathing silence, the absent-minded, soliloquizing hum or
whistle, of absorbed concentration. A troop of newly arrived students,
very young, pink and callow, followed nervously, rather abjectly, at the
Director's heels. Each of them carried a notebook, in which, whenever
the great man spoke, he desperately scribbled. Straight from the
horse's mouth. It was a rare privilege. The D. H. C. for Central London
always made a point of personally conducting his new students round
the various departments.
"Just to give you a general idea," he would explain to them. For of
course some sort of general idea they must have, if they were to do
their work intelligently-though as little of one, if they were to be good
and happy members of society, as possible. For particulars, as every
one knows, make for virtue and happiness; generalities are intellectu-
ally necessary evils. Not philosophers but fret-sawyers and stamp col-
lectors compose the backbone of society.
"To-morrow," he would add, smiling at them with a slightly menacing
geniality, "you'll be settling down to serious work. You won't have time
for generalities. Meanwhile ..."
Meanwhile, it was a privilege. Straight from the horse's mouth into the
notebook. The boys scribbled like mad.
Tall and rather thin but upright, the Director advanced into the room.
He had a long chin and big rather prominent teeth, just covered, when
he was not talking, by his full, floridly curved lips. Old, young? Thirty?
Fifty? Fifty-five? It was hard to say. And anyhow the question didn't
arise; in this year of stability, A. F. 632, it didn't occur to you to ask it.
"I shall begin at the beginning," said the D.H.C. and the more zealous
students recorded his intention in their notebooks: Begin at the begin-
ning. "These," he waved his hand, "are the incubators." And opening
an insulated door he showed them racks upon racks of numbered test-
tubes. "The week's supply of ova. Kept," he explained, "at blood heat;
whereas the male gametes," and here he opened another door, "they
have to be kept at thirty-five instead of thirty-seven. Full blood heat
sterilizes." Rams wrapped in theremogene beget no lambs.
Still leaning against the incubators he gave them, while the pencils
scurried illegibly across the pages, a brief description of the modern
fertilizing process; spoke first, of course, of its surgical introduc-
tion-"the operation undergone voluntarily for the good of Society, not
to mention the fact that it carries a bonus amounting to six months'
salary"; continued with some account of the technique for preserving
the excised ovary alive and actively developing; passed on to a consid-
eration of optimum temperature, salinity, viscosity; referred to the liq-
uor in which the detached and ripened eggs were kept; and, leading
his charges to the work tables, actually showed them how this liquor
was drawn off from the test-tubes; how it was let out drop by drop
onto the specially warmed slides of the microscopes; how the eggs
which it contained were inspected for abnormalities, counted and
transferred to a porous receptacle; how (and he now took them to
watch the operation) this receptacle was immersed in a warm bouillon
containing free-swimming spermatozoa-at a minimum concentration
of one hundred thousand per cubic centimetre, he insisted; and how,
after ten minutes, the container was lifted out of the liquor and its
contents re-examined; how, if any of the eggs remained unfertilized, it
was again immersed, and, if necessary, yet again; how the fertilized
ova went back to the incubators; where the Alphas and Betas re-
mained until definitely bottled; while the Gammas, Deltas and Epsilons
were brought out again, after only thirty-six hours, to undergo Bo-
kanovsky's Process.
"Bokanovsky's Process," repeated the Director, and the students un-
derlined the words in their little notebooks.
One egg, one embryo, one adult-normality. But a bokanovskified egg
will bud, will proliferate, will divide. From eight to ninety-six buds, and
every bud will grow into a perfectly formed embryo, and every embryo
into a full-sized adult. Making ninety-six human beings grow where
only one grew before. Progress.
"Essentially," the D.H.C. concluded, "bokanovskification consists of a
series of arrests of development. We check the normal growth and,
paradoxically enough, the egg responds by budding."
Responds by budding. The pencils were busy.
He pointed. On a very slowly moving band a rack-full of test-tubes was
entering a large metal box, another, rack-full was emerging. Machinery
faintly purred. It took eight minutes for the tubes to go through, he
told them. Eight minutes of hard X-rays being about as much as an
egg can stand. A few died; of the rest, the least susceptible divided
into two; most put out four buds; some eight; all were returned to the
incubators, where the buds began to develop; then, after two days,
were suddenly chilled, chilled and checked. Two, four, eight, the buds
in their turn budded; and having budded were dosed almost to death
with alcohol; consequently burgeoned again and having budded-bud
out of bud out of bud-were thereafter-further arrest being generally
fatal-left to develop in peace. By which time the original egg was in a
fair way to becoming anything from eight to ninety-six embryos- a
prodigious improvement, you will agree, on nature. Identical twins-but
not in piddling twos and threes as in the old viviparous days, when an
egg would sometimes accidentally divide; actually by dozens, by
scores at a time.
"Scores," the Director repeated and flung out his arms, as though he
were distributing largesse. "Scores."
But one of the students was fool enough to ask where the advantage
lay.
"My good boy!" The Director wheeled sharply round on him. "Can't you
see? Can't you see?" He raised a hand; his expression was solemn.
"Bokanovsky's Process is one of the major instruments of social stabil-
ity!"
Major instruments of social stability.
Standard men and women; in uniform batches. The whole of a small
factory staffed with the products of a single bokanovskified egg.
"Ninety-six identical twins working ninety-six identical machines!" The
voice was almost tremulous with enthusiasm. "You really know where
you are. For the first time in history." He quoted the planetary motto.
"Community, Identity, Stability." Grand words. "If we could bo-
kanovskify indefinitely the whole problem would be solved."
Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil-
lions of identical twins. The principle of mass production at last applied
to biology.
"But, alas," the Director shook his head, "we can't bokanovskify indefi-
nitely."
Ninety-six seemed to be the limit; seventy-two a good average. From
the same ovary and with gametes of the same male to manufacture as
many batches of identical twins as possible-that was the best (sadly a
second best) that they could do. And even that was difficult.
"For in nature it takes thirty years for two hundred eggs to reach ma-
turity. But our business is to stabilize the population at this moment,
here and now. Dribbling out twins over a quarter of a century-what
would be the use of that?"
Obviously, no use at all. But Podsnap's Technique had immensely ac-
celerated the process of ripening. They could make sure of at least a
hundred and fifty mature eggs within two years. Fertilize and bo-
kanovskify-in other words, multiply by seventy-two-and you get an
average of nearly eleven thousand brothers and sisters in a hundred
and fifty batches of identical twins, all within two years of the same
age.
"And in exceptional cases we can make one ovary yield us over fifteen
thousand adult individuals."
Beckoning to a fair-haired, ruddy young man who happened to be
passing at the moment. "Mr. Foster," he called. The ruddy young man
approached. "Can you tell us the record for a single ovary, Mr. Foster?"
"Sixteen thousand and twelve in this Centre," Mr. Foster replied with-
out hesitation. He spoke very quickly, had a vivacious blue eye, and
took an evident pleasure in quoting figures. "Sixteen thousand and
twelve; in one hundred and eighty-nine batches of identicals. But of
course they've done much better," he rattled on, "in some of the tropi-
cal Centres. Singapore has often produced over sixteen thousand five
hundred; and Mombasa has actually touched the seventeen thousand
mark. But then they have unfair advantages. You should see the way a
negro ovary responds to pituitary! It's quite astonishing, when you're
used to working with European material. Still," he added, with a laugh
(but the light of combat was in his eyes and the lift of his chin was
challenging), "still, we mean to beat them if we can. I'm working on a
wonderful Delta-Minus ovary at this moment. Only just eighteen
months old. Over twelve thousand seven hundred children already, ei-
ther decanted or in embryo. And still going strong. We'll beat them
yet."
"That's the spirit I like!" cried the Director, and clapped Mr. Foster on
the shoulder. "Come along with us, and give these boys the benefit of
your expert knowledge."
Mr. Foster smiled modestly. "With pleasure." They went.
In the Bottling Room all was harmonious bustle and ordered activity.
Flaps of fresh sow's peritoneum ready cut to the proper size came
shooting up in little lifts from the Organ Store in the sub-basement.
Whizz and then, click! the lift-hatches hew open; the bottle-liner had
only to reach out a hand, take the flap, insert, smooth-down, and be-
fore the lined bottle had had time to travel out of reach along the end-
less band, whizz, click! another flap of peritoneum had shot up from
the depths, ready to be slipped into yet another bottle, the next of that
slow interminable procession on the band.
Next to the Liners stood the Matriculators. The procession advanced;
one by one the eggs were transferred from their test-tubes to the
larger containers; deftly the peritoneal lining was slit, the morula
dropped into place, the saline solution poured in ... and already the
bottle had passed, and it was the turn of the labellers. Heredity, date
of fertilization, membership of Bokanovsky Group-details were trans-
ferred from test-tube to bottle. No longer anonymous, but named,
identified, the procession marched slowly on; on through an opening in
the wall, slowly on into the Social Predestination Room.
"Eighty-eight cubic metres of card-index," said Mr. Foster with relish,
as they entered."""
def create_setup_and_compute(
model_names: List[str],
gpu: bool = True,
tensorflow: bool = False,
average_over: int = 3,
torchscript: bool = False,
xla: bool = False,
amp: bool = False,
fp16: bool = False,
save_to_csv: bool = False,
csv_filename: str = f"results_{round(time())}.csv",
):
if xla:
tf.config.optimizer.set_jit(True)
if amp:
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
if tensorflow:
dictionary = {model_name: {} for model_name in model_names}
results = _compute_tensorflow(model_names, dictionary, average_over, amp)
else:
device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
dictionary = {model_name: {} for model_name in model_names}
results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16)
print("=========== RESULTS ===========")
for model_name in model_names:
print("\t" + f"======= MODEL CHECKPOINT: {model_name} =======")
for batch_size in results[model_name]["bs"]:
print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
for slice_size in results[model_name]["ss"]:
result = results[model_name]["results"][batch_size][slice_size]
if isinstance(result, str):
print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result}")
else:
print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s")
if save_to_csv:
with open(csv_filename, mode="w") as csv_file:
fieldnames = [
"model",
"1x8",
"1x64",
"1x128",
"1x256",
"1x512",
"1x1024",
"2x8",
"2x64",
"2x128",
"2x256",
"2x512",
"2x1024",
"4x8",
"4x64",
"4x128",
"4x256",
"4x512",
"4x1024",
"8x8",
"8x64",
"8x128",
"8x256",
"8x512",
"8x1024",
]
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for model_name in model_names:
model_results = {
f"{bs}x{ss}": results[model_name]["results"][bs][ss]
for bs in results[model_name]["results"]
for ss in results[model_name]["results"][bs]
}
writer.writerow({"model": model_name, **model_results})
def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
for c, model_name in enumerate(model_names):
print(f"{c + 1} / {len(model_names)}")
config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
model = AutoModel.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
max_input_size = tokenizer.max_model_input_sizes[model_name]
batch_sizes = [1, 2, 4, 8]
slice_sizes = [8, 64, 128, 256, 512, 1024]
dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
for batch_size in batch_sizes:
if fp16:
model.half()
model.to(device)
model.eval()
for slice_size in slice_sizes:
if max_input_size is not None and slice_size > max_input_size:
dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
else:
sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
try:
if torchscript:
print("Tracing model with sequence size", sequence.shape)
inference = torch.jit.trace(model, sequence)
inference(sequence)
else:
inference = model
inference(sequence)
print("Going through model with sequence of shape", sequence.shape)
runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
average_time = sum(runtimes) / float(len(runtimes)) / 3.0
dictionary[model_name]["results"][batch_size][slice_size] = average_time
except RuntimeError as e:
print("Doesn't fit on GPU.", e)
torch.cuda.empty_cache()
dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
return dictionary
def _compute_tensorflow(model_names, dictionary, average_over, amp):
for c, model_name in enumerate(model_names):
print(f"{c + 1} / {len(model_names)}")
config = AutoConfig.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
max_input_size = tokenizer.max_model_input_sizes[model_name]
batch_sizes = [1, 2, 4, 8]
slice_sizes = [8, 64, 128, 256, 512, 1024]
dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
print("Using model", model)
@tf.function
def inference(inputs):
return model(inputs)
for batch_size in batch_sizes:
for slice_size in slice_sizes:
if max_input_size is not None and slice_size > max_input_size:
dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
else:
sequence = tf.stack(
[tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
)
try:
print("Going through model with sequence of shape", sequence.shape)
# To make sure that the model is traced + that the tensors are on the appropriate device
inference(sequence)
runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
average_time = sum(runtimes) / float(len(runtimes)) / 3.0
dictionary[model_name]["results"][batch_size][slice_size] = average_time
except tf.errors.ResourceExhaustedError as e:
print("Doesn't fit on GPU.", e)
dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
return dictionary
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
required=False,
type=str,
default="all",
help="Model checkpoints to be provided "
"to the AutoModel classes. Leave "
"blank to benchmark the base version "
"of all available model "
"architectures.",
)
parser.add_argument(
"--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
)
parser.add_argument(
"--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices"
)
parser.add_argument(
"--torchscript",
required=False,
action="store_true",
help="Pytorch only: trace the models " "using torchscript",
)
parser.add_argument(
"--tensorflow",
required=False,
action="store_true",
help="Benchmark the TensorFlow version "
"of the models. Will run on GPU if "
"the correct dependencies are "
"installed",
)
parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
parser.add_argument(
"--amp",
required=False,
action="store_true",
help="TensorFlow only: use automatic mixed precision acceleration.",
)
parser.add_argument(
"--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference."
)
parser.add_argument(
"--keras_predict",
required=False,
action="store_true",
help="Whether to use model.predict " "instead of model() to do a " "forward pass.",
)
parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.")
parser.add_argument(
"--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv."
)
parser.add_argument(
"--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
)
args = parser.parse_args()
if args.models == "all":
args.models = [
"gpt2",
"bert-base-cased",
"xlnet-base-cased",
"xlm-mlm-en-2048",
"transfo-xl-wt103",
"openai-gpt",
"distilbert-base-uncased",
"distilgpt2",
"roberta-base",
"ctrl",
]
else:
args.models = args.models.split()
print("Running with arguments", args)
if args.torch:
if is_torch_available():
create_setup_and_compute(
model_names=args.models,
tensorflow=False,
gpu=args.torch_cuda,
torchscript=args.torchscript,
fp16=args.fp16,
save_to_csv=args.save_to_csv,
csv_filename=args.csv_filename,
average_over=args.average_over,
)
else:
raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
if args.tensorflow:
if is_tf_available():
create_setup_and_compute(
model_names=args.models,
tensorflow=True,
xla=args.xla,
amp=args.amp,
save_to_csv=args.save_to_csv,
csv_filename=args.csv_filename,
average_over=args.average_over,
)
else:
raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
if __name__ == "__main__":
main()

@ -0,0 +1,5 @@
# Community contributed examples
This folder contains examples which are not actively maintained (mostly contributed by the community).
Using these examples together with a recent version of the library usually requires making small (sometimes big) adaptations to get the scripts working.

@ -0,0 +1,43 @@
import torch
from transformers.modeling_camembert import CamembertForMaskedLM
from transformers.tokenization_camembert import CamembertTokenizer
def fill_mask(masked_input, model, tokenizer, topk=5):
# Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
assert masked_input.count("<mask>") == 1
input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1
logits = model(input_ids)[0] # The last hidden-state is the first element of the output tuple
masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
logits = logits[0, masked_index, :]
prob = logits.softmax(dim=0)
values, indices = prob.topk(k=topk, dim=0)
topk_predicted_token_bpe = " ".join(
[tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
)
masked_token = tokenizer.mask_token
topk_filled_outputs = []
for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
predicted_token = predicted_token_bpe.replace("\u2581", " ")
if " {0}".format(masked_token) in masked_input:
topk_filled_outputs.append(
(
masked_input.replace(" {0}".format(masked_token), predicted_token),
values[index].item(),
predicted_token,
)
)
else:
topk_filled_outputs.append(
(masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
)
return topk_filled_outputs
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")
model.eval()
masked_input = "Le camembert est <mask> :)"
print(fill_mask(masked_input, model, tokenizer, topk=3))

@ -1,5 +1,5 @@
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -17,43 +17,59 @@
Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
It self adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
This script with default values fine-tunes and evaluate a pretrained OpenAI GPT on the RocStories dataset:
python run_openai_gpt.py \
--model_name openai-gpt \
--do_train \
--do_eval \
--train_dataset "$ROC_STORIES_DIR/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" \
--eval_dataset "$ROC_STORIES_DIR/cloze_test_test__spring2016 - cloze_test_ALL_test.csv" \
--output_dir ../log \
--train_batch_size 16 \
"""
import argparse
import csv
import logging
import os
import random
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm, trange
from transformers import (
CONFIG_NAME,
WEIGHTS_NAME,
AdamW,
OpenAIGPTDoubleHeadsModel,
OpenAIGPTTokenizer,
get_linear_schedule_with_warmup,
)
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
)
logger = logging.getLogger(__name__)
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def load_rocstories_dataset(dataset_path):
""" Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
with open(dataset_path, encoding="utf_8") as f:
f = csv.reader(f)
output = []
next(f)  # skip the first line
for line in tqdm(f):
output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1))
return output
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
""" Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
@ -65,52 +81,73 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
n_batch = len(dataset)
input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
mc_labels = np.zeros((n_batch,), dtype=np.int64)
for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
input_ids[i, 0, : len(with_cont1)] = with_cont1
input_ids[i, 1, : len(with_cont2)] = with_cont2
mc_token_ids[i, 0] = len(with_cont1) - 1
mc_token_ids[i, 1] = len(with_cont2) - 1
lm_labels[i, 0, : len(with_cont1)] = with_cont1
lm_labels[i, 1, : len(with_cont2)] = with_cont2
mc_labels[i] = mc_label
all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
return tensor_datasets
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument("--train_dataset", type=str, default="")
parser.add_argument("--eval_dataset", type=str, default="")
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--num_train_epochs", type=int, default=3)
parser.add_argument("--train_batch_size", type=int, default=8)
parser.add_argument("--eval_batch_size", type=int, default=16)
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", type=int, default=1)
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training \
steps to perform. Override num_train_epochs.",
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before\
performing a backward/update pass.",
)
parser.add_argument("--learning_rate", type=float, default=6.25e-5)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
parser.add_argument("--weight_decay", type=float, default=0.01)
parser.add_argument("--lm_coef", type=float, default=0.9)
parser.add_argument("--n_valid", type=int, default=374)
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
args = parser.parse_args()
print(args)
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
@ -133,15 +170,15 @@ def main():
# Load tokenizer and model
# This loading functions also add new tokens and embeddings called `special tokens`
# These new embeddings will be fine-tuned on the RocStories dataset
special_tokens = ["_start_", "_delimiter_", "_classify_"]
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
tokenizer.add_tokens(special_tokens)
special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
model.resize_token_embeddings(len(tokenizer))
model.to(device)
# Load and encode the datasets
def tokenize_and_encode(obj):
""" Tokenize and encode a nested object """
if isinstance(obj, str):
@ -149,16 +186,20 @@ def main():
elif isinstance(obj, int):
return obj
return list(tokenize_and_encode(o) for o in obj)
logger.info("Encoding dataset...")
train_dataset = load_rocstories_dataset(args.train_dataset)
eval_dataset = load_rocstories_dataset(args.eval_dataset)
datasets = (train_dataset, eval_dataset)
encoded_datasets = tokenize_and_encode(datasets)
# Compute the max input length for the Transformer
max_length = model.config.n_positions // 2 - 2
input_length = max(
len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
for dataset in encoded_datasets
for story, cont1, cont2, _ in dataset
)
input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
# Prepare inputs tensors and dataloaders
@ -174,19 +215,26 @@ def main():
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
# Prepare optimizer
if args.do_train:
if args.max_steps > 0:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
"weight_decay": args.weight_decay,
},
{"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
if args.do_train:
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
@ -198,26 +246,35 @@ def main():
for step, batch in enumerate(tqdm_bar):
batch = tuple(t.to(device) for t in batch)
input_ids, mc_token_ids, lm_labels, mc_labels = batch
losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
loss = args.lm_coef * losses[0] + losses[1]
loss.backward()
scheduler.step()
optimizer.step()
optimizer.zero_grad()
tr_loss += loss.item()
exp_average_loss = (
loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
)
nb_tr_steps += 1
tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])
# Save a trained model
if args.do_train:
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(args.output_dir)
# Load a trained model and vocabulary that you have fine-tuned
model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
model.to(device)
if args.do_eval:
@ -228,11 +285,12 @@ def main():
batch = tuple(t.to(device) for t in batch)
input_ids, mc_token_ids, lm_labels, mc_labels = batch
with torch.no_grad():
_, mc_loss, _, mc_logits = model(
input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
)
mc_logits = mc_logits.detach().cpu().numpy()
mc_labels = mc_labels.to("cpu").numpy()
tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
eval_loss += mc_loss.mean().item()
@ -243,10 +301,8 @@ def main():
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
train_loss = tr_loss / nb_tr_steps if args.do_train else None
result = {'eval_loss': eval_loss, result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}
'eval_accuracy': eval_accuracy,
'train_loss': train_loss}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt") output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer: with open(output_eval_file, "w") as writer:
@ -255,5 +311,6 @@ def main():
logger.info(" %s = %s", key, str(result[key])) logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == '__main__':
if __name__ == "__main__":
main() main()


@ -0,0 +1,737 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner.
Finetuning the library models for multiple choice on SWAG (Bert).
"""
import argparse
import csv
import glob
import logging
import os
import random
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (
WEIGHTS_NAME,
AdamW,
BertConfig,
BertForMultipleChoice,
BertTokenizer,
get_linear_schedule_with_warmup,
)
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
from tensorboardX import SummaryWriter
logger = logging.getLogger(__name__)
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ())
MODEL_CLASSES = {
"bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
}
class SwagExample(object):
"""A single training/test example for the SWAG dataset."""
def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
self.swag_id = swag_id
self.context_sentence = context_sentence
self.start_ending = start_ending
self.endings = [
ending_0,
ending_1,
ending_2,
ending_3,
]
self.label = label
def __str__(self):
return self.__repr__()
def __repr__(self):
attributes = [
"swag_id: {}".format(self.swag_id),
"context_sentence: {}".format(self.context_sentence),
"start_ending: {}".format(self.start_ending),
"ending_0: {}".format(self.endings[0]),
"ending_1: {}".format(self.endings[1]),
"ending_2: {}".format(self.endings[2]),
"ending_3: {}".format(self.endings[3]),
]
if self.label is not None:
attributes.append("label: {}".format(self.label))
return ", ".join(attributes)
class InputFeatures(object):
def __init__(self, example_id, choices_features, label):
self.example_id = example_id
self.choices_features = [
{"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
for _, input_ids, input_mask, segment_ids in choices_features
]
self.label = label
def read_swag_examples(input_file, is_training=True):
with open(input_file, "r", encoding="utf-8") as f:
lines = list(csv.reader(f))
if is_training and lines[0][-1] != "label":
raise ValueError("For training, the input file must contain a label column.")
examples = [
SwagExample(
swag_id=line[2],
context_sentence=line[4],
start_ending=line[5], # in the swag dataset, the
# common beginning of each
# choice is stored in "sent2".
ending_0=line[7],
ending_1=line[8],
ending_2=line[9],
ending_3=line[10],
label=int(line[11]) if is_training else None,
)
for line in lines[1:] # we skip the line with the column names
]
return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
"""Loads a data file into a list of `InputBatch`s."""
# Swag is a multiple choice task. To perform this task using Bert,
# we will use the formatting proposed in "Improving Language
# Understanding by Generative Pre-Training" and suggested by
# @jacobdevlin-google in this issue
# https://github.com/google-research/bert/issues/38.
#
# Each choice will correspond to a sample on which we run the
# inference. For a given Swag example, we will create the 4
# following inputs:
# - [CLS] context [SEP] choice_1 [SEP]
# - [CLS] context [SEP] choice_2 [SEP]
# - [CLS] context [SEP] choice_3 [SEP]
# - [CLS] context [SEP] choice_4 [SEP]
# The model will output a single value for each input. To get the
# final decision of the model, we will run a softmax over these 4
# outputs.
features = []
for example_index, example in tqdm(enumerate(examples)):
context_tokens = tokenizer.tokenize(example.context_sentence)
start_ending_tokens = tokenizer.tokenize(example.start_ending)
choices_features = []
for ending_index, ending in enumerate(example.endings):
# We create a copy of the context tokens in order to be
# able to shrink it according to ending_tokens
context_tokens_choice = context_tokens[:]
ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
# Modifies `context_tokens_choice` and `ending_tokens` in
# place so that the total length is less than the
# specified length. Account for [CLS], [SEP], [SEP] with
# "- 3"
_truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
choices_features.append((tokens, input_ids, input_mask, segment_ids))
label = example.label
if example_index < 5:
logger.info("*** Example ***")
logger.info("swag_id: {}".format(example.swag_id))
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
logger.info("choice: {}".format(choice_idx))
logger.info("tokens: {}".format(" ".join(tokens)))
logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
logger.info("input_mask: {}".format(" ".join(map(str, input_mask))))
logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids))))
if is_training:
logger.info("label: {}".format(label))
features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label))
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def select_field(features, field):
return [[choice[field] for choice in feature.choices_features] for feature in features]
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
if args.local_rank not in [-1, 0]:
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# Load data features from cache or dataset file
input_file = args.predict_file if evaluate else args.train_file
cached_features_file = os.path.join(
os.path.dirname(input_file),
"cached_{}_{}_{}".format(
"dev" if evaluate else "train",
list(filter(None, args.model_name_or_path.split("/"))).pop(),
str(args.max_seq_length),
),
)
if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
logger.info("Loading features from cached file %s", cached_features_file)
features = torch.load(cached_features_file)
else:
logger.info("Creating features from dataset file at %s", input_file)
examples = read_swag_examples(input_file)
features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# Convert to Tensors and build dataset
all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
all_label = torch.tensor([f.label for f in features], dtype=torch.long)
if evaluate:
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
else:
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
if output_examples:
return dataset, examples, features
return dataset
def train(args, train_dataset, model, tokenizer):
""" Train the model """
if args.local_rank in [-1, 0]:
tb_writer = SummaryWriter()
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
if args.max_steps > 0:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": args.weight_decay,
},
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
if args.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
)
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
logger.info(
" Total train batch size (w. parallel, distributed & accumulation) = %d",
args.train_batch_size
* args.gradient_accumulation_steps
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
)
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args)  # Added here for reproducibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
model.train()
batch = tuple(t.to(args.device) for t in batch)
inputs = {
"input_ids": batch[0],
"attention_mask": batch[1],
# 'token_type_ids': None if args.model_type == 'xlm' else batch[2],
"token_type_ids": batch[2],
"labels": batch[3],
}
# if args.model_type in ['xlnet', 'xlm']:
# inputs.update({'cls_index': batch[5],
# 'p_mask': batch[6]})
outputs = model(**inputs)
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
# Log metrics
if (
args.local_rank == -1 and args.evaluate_during_training
): # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer)
for key, value in results.items():
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
logging_loss = tr_loss
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = (
model.module if hasattr(model, "module") else model
) # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_vocabulary(output_dir)
torch.save(args, os.path.join(output_dir, "training_args.bin"))
logger.info("Saving model checkpoint to %s", output_dir)
if args.max_steps > 0 and global_step > args.max_steps:
epoch_iterator.close()
break
if args.max_steps > 0 and global_step > args.max_steps:
train_iterator.close()
break
if args.local_rank in [-1, 0]:
tb_writer.close()
return global_step, tr_loss / global_step
def evaluate(args, model, tokenizer, prefix=""):
dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = %d", len(dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in tqdm(eval_dataloader, desc="Evaluating"):
model.eval()
batch = tuple(t.to(args.device) for t in batch)
with torch.no_grad():
inputs = {
"input_ids": batch[0],
"attention_mask": batch[1],
# 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
"token_type_ids": batch[2],
"labels": batch[3],
}
# if args.model_type in ['xlnet', 'xlm']:
# inputs.update({'cls_index': batch[4],
# 'p_mask': batch[5]})
outputs = model(**inputs)
tmp_eval_loss, logits = outputs[:2]
eval_loss += tmp_eval_loss.mean().item()
logits = logits.detach().cpu().numpy()
label_ids = inputs["labels"].to("cpu").numpy()
tmp_eval_accuracy = accuracy(logits, label_ids)
eval_accuracy += tmp_eval_accuracy
nb_eval_steps += 1
nb_eval_examples += inputs["input_ids"].size(0)
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info("%s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
return result
def main():
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
)
parser.add_argument(
"--predict_file",
default=None,
type=str,
required=True,
help="SWAG csv for predictions. E.g., val.csv or test.csv",
)
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
)
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints and predictions will be written.",
)
# Other parameters
parser.add_argument(
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
)
parser.add_argument(
"--tokenizer_name",
default="",
type=str,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--max_seq_length",
default=384,
type=int,
help="The maximum total input sequence length after tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.",
)
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
)
parser.add_argument(
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
)
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument(
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
)
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
parser.add_argument(
"--eval_all_checkpoints",
action="store_true",
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
parser.add_argument(
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
)
parser.add_argument(
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
parser.add_argument(
"--fp16",
action="store_true",
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
parser.add_argument(
"--fp16_opt_level",
type=str,
default="O1",
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
args = parser.parse_args()
if (
os.path.exists(args.output_dir)
and os.listdir(args.output_dir)
and args.do_train
and not args.overwrite_output_dir
):
raise ValueError(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
args.output_dir
)
)
# Setup distant debugging if needed
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")
args.n_gpu = 1
args.device = device
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
args.local_rank,
device,
args.n_gpu,
bool(args.local_rank != -1),
args.fp16,
)
# Set seed
set_seed(args)
# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
)
model = model_class.from_pretrained(
args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
model.to(args.device)
logger.info("Training/evaluation parameters %s", args)
# Training
if args.do_train:
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
# Save the trained model and the tokenizer
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
# Create output directory if needed
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
logger.info("Saving model checkpoint to %s", args.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = (
model.module if hasattr(model, "module") else model
) # Take care of distributed/parallel training
model_to_save.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
# Load a trained model and vocabulary that you have fine-tuned
model = model_class.from_pretrained(args.output_dir)
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
model.to(args.device)
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
results = {}
if args.do_eval and args.local_rank in [-1, 0]:
if args.do_train:
checkpoints = [args.output_dir]
else:
# if do_train is False and do_eval is true, load model directly from pretrained.
checkpoints = [args.model_name_or_path]
if args.eval_all_checkpoints:
checkpoints = list(
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
)
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
logger.info("Evaluate the following checkpoints: %s", checkpoints)
for checkpoint in checkpoints:
# Reload the model
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
model = model_class.from_pretrained(checkpoint)
tokenizer = tokenizer_class.from_pretrained(checkpoint)
model.to(args.device)
# Evaluate
result = evaluate(args, model, tokenizer, prefix=global_step)
result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
results.update(result)
logger.info("Results: {}".format(results))
return results
if __name__ == "__main__":
main()


@ -1,5 +1,5 @@
# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -19,55 +19,48 @@
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
"""
-from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
-import time
import math
+import time

import torch

-from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus
+from transformers import TransfoXLCorpus, TransfoXLLMHeadModel

-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
logger = logging.getLogger(__name__)

def main():
-    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
-    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
-                        help='pretrained model name')
-    parser.add_argument('--split', type=str, default='test',
-                        choices=['all', 'valid', 'test'],
-                        help='which split to evaluate')
-    parser.add_argument('--batch_size', type=int, default=10,
-                        help='batch size')
-    parser.add_argument('--tgt_len', type=int, default=128,
-                        help='number of tokens to predict')
-    parser.add_argument('--ext_len', type=int, default=0,
-                        help='length of the extended context')
-    parser.add_argument('--mem_len', type=int, default=1600,
-                        help='length of the retained previous heads')
-    parser.add_argument('--clamp_len', type=int, default=1000,
-                        help='max positional embedding index')
-    parser.add_argument('--no_cuda', action='store_true',
-                        help='Do not use CUDA even though CUA is available')
-    parser.add_argument('--work_dir', type=str, required=True,
-                        help='path to the work_dir')
-    parser.add_argument('--no_log', action='store_true',
-                        help='do not log the eval result')
-    parser.add_argument('--same_length', action='store_true',
-                        help='set same length attention with masking')
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model")
+    parser.add_argument("--model_name", type=str, default="transfo-xl-wt103", help="pretrained model name")
+    parser.add_argument(
+        "--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate"
+    )
+    parser.add_argument("--batch_size", type=int, default=10, help="batch size")
+    parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict")
+    parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context")
+    parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads")
+    parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index")
+    parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUA is available")
+    parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir")
+    parser.add_argument("--no_log", action="store_true", help="do not log the eval result")
+    parser.add_argument("--same_length", action="store_true", help="set same length attention with masking")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()
-    assert args.ext_len >= 0, 'extended context length must be non-negative'
+    assert args.ext_len >= 0, "extended context length must be non-negative"

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@ -81,19 +74,19 @@ def main():
    # and tokenizing the dataset
    # The pre-processed corpus is a convertion (using the conversion script )
    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
-    ntokens = len(corpus.vocab)

-    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
-                                  device=device, ext_len=args.ext_len)
-    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
-                                  device=device, ext_len=args.ext_len)
+    va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
+    te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)

    # Load a pre-trained model
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
    model = model.to(device)

-    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
-        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
+    logger.info(
+        "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
+            args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len
+        )
+    )

    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    if args.clamp_len > 0:
@ -107,46 +100,45 @@ def main():
    def evaluate(eval_iter):
        # Turn on evaluation mode which disables dropout.
        model.eval()
-        total_len, total_loss = 0, 0.
+        total_len, total_loss = 0, 0.0
        start_time = time.time()
        with torch.no_grad():
            mems = None
            for idx, (data, target, seq_len) in enumerate(eval_iter):
-                ret = model(data, target, mems)
-                loss, mems = ret
+                ret = model(data, lm_labels=target, mems=mems)
+                loss, _, mems = ret
                loss = loss.mean()
                total_loss += seq_len * loss.item()
                total_len += seq_len
            total_time = time.time() - start_time
-        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
-            total_time, 1000 * total_time / (idx+1)))
+        logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1)))
        return total_loss / total_len

    # Run on test data.
-    if args.split == 'all':
+    if args.split == "all":
        test_loss = evaluate(te_iter)
        valid_loss = evaluate(va_iter)
-    elif args.split == 'valid':
+    elif args.split == "valid":
        valid_loss = evaluate(va_iter)
        test_loss = None
-    elif args.split == 'test':
+    elif args.split == "test":
        test_loss = evaluate(te_iter)
        valid_loss = None

    def format_log(loss, split):
-        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
-            split, loss, math.exp(loss))
+        log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss))
        return log_str

-    log_str = ''
+    log_str = ""
    if valid_loss is not None:
-        log_str += format_log(valid_loss, 'valid')
+        log_str += format_log(valid_loss, "valid")
    if test_loss is not None:
-        log_str += format_log(test_loss, 'test')
+        log_str += format_log(test_loss, "test")

-    logger.info('=' * 100)
+    logger.info("=" * 100)
    logger.info(log_str)
-    logger.info('=' * 100)
+    logger.info("=" * 100)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
    main()


@ -0,0 +1,186 @@
# Distil*
This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.
**January 20, 2020 - Bug fixing** We have recently discovered and fixed [a bug](https://github.com/huggingface/transformers/commit/48cbf267c988b56c71a2380f748a3e6092ccaed3) in the evaluation of our `run_*.py` scripts that caused the reported metrics to be over-estimated on average. We have updated all the metrics with the latest runs.
**December 6, 2019 - Update** We release **DistilmBERT**: 92% of `bert-base-multilingual-cased` on XNLI. The model supports 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
**November 19, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased` on NER tasks.
**October 23, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
**October 3, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performance. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
**September 19, 2019 - Update:** We fixed bugs in the code and released an updated version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 99% of `BERT-base`'s performance on GLUE, and an 86.9 F1 score on the SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
## What is Distil*
Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distilled BERT: a small, fast, cheap and light Transformer model based on the BERT architecture. It has 40% fewer parameters than `bert-base-uncased` and runs 60% faster, while preserving 99% of BERT's performance as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique that compresses a large model (the teacher) into a smaller model (the student). By distilling BERT, we obtain a smaller Transformer model that bears a lot of similarities to the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option for putting large-scale pretrained Transformer models into production.
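To make the distillation objective more concrete, here is a minimal, illustrative sketch of the soft-target part of the loss: a temperature-scaled KL divergence between the teacher's and the student's output distributions. It is in the same spirit as the `nn.KLDivLoss(reduction="batchmean")` term used by the `Distiller` class in `distiller.py`, but the function and variable names below are placeholders, not APIs of this repository:

```python
import torch
import torch.nn.functional as F


def soft_target_loss(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float = 2.0):
    """Temperature-scaled KL divergence between teacher and student output distributions (illustrative)."""
    student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    # The temperature**2 factor keeps gradient magnitudes comparable to a hard-label cross-entropy term.
    return F.kl_div(student_log_probs, teacher_probs, reduction="batchmean") * temperature ** 2
```

In practice this soft-target term is combined with the regular supervised loss (MLM or CLM) and, optionally, a cosine embedding loss between hidden states; those are the contributions weighted by the `--alpha_ce`, `--alpha_mlm`/`--alpha_clm` and `--alpha_cos` arguments in the training commands further down.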
We have applied the same method to other Transformer architectures and released the weights:
- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 16.3 compared to 21.1 for **DistilGPT2** (after fine-tuning on the train set).
- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base`'s performance on GLUE while being twice as fast and 35% smaller.
- German BERT: **German DistilBERT** reaches 99% of `bert-base-german-dbmdz-cased`'s performance on German NER (CoNLL-2003).
- Multilingual BERT: **DistilmBERT** reaches 92% of Multilingual BERT's performance on XNLI while being twice as fast and 25% smaller. The model supports 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).
Here are the results on the dev sets of GLUE:
| Model | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2| STS-B| WNLI |
| :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---: |
| BERT-base-uncased | **77.6** | 49.2 | 80.8 | 87.4 | 87.5 | 86.4 | 61.7 | 92.0 | 83.8 | 45.1 |
| DistilBERT-base-uncased | **76.8** | 43.6 | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 56.3 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
| DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.3 | 84.0 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 |
<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly performed transfer learning on the pre-trained DistilRoBERTa.
<sup>2</sup> Macro-score computed without WNLI.
<sup>3</sup> We compute this score ourselves for completeness.
Here are the results on the *test* sets for 6 of the languages available in XNLI. The results are computed in the zero shot setting (trained on the English portion and evaluated on the target language portion):
| Model | English | Spanish | Chinese | German | Arabic | Urdu |
| :---: | :---: | :---: | :---: | :---: | :---: | :---:|
| mBERT base cased (computed) | 82.1 | 74.6 | 69.1 | 72.3 | 66.4 | 58.5 |
| mBERT base uncased (reported)| 81.4 | 74.3 | 63.8 | 70.5 | 62.1 | 58.3 |
| DistilmBERT | 78.2 | 69.1 | 64.0 | 66.3 | 59.1 | 54.7 |
## Setup
This part of the library has only been tested with Python 3.6+. There are a few specific dependencies to install before launching a distillation; you can install them with the command `pip install -r requirements.txt`.
**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breaking changes compared to v1.1.0).
## How to use DistilBERT
Transformers includes six pre-trained Distil* models, covering English, German and a multilingual variant:
- `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
- `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, the `bert-base-uncased` version of BERT reaches an 88.5 F1 score).
- `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score).
- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
- `distilbert-base-multilingual-cased`: DistilmBERT multilingual model pretrained with the supervision of `bert-base-multilingual-cased` on the concatenation of Wikipedia in 104 different languages. The model supports the 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). The model has 6 layers, 768 dimension and 12 heads, totalizing 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base.
Using DistilBERT is very similar to using BERT. DistilBERT shares the same tokenizer as BERT's `bert-base-uncased`; we also expose this tokenizer under the `DistilBertTokenizer` name to keep the naming consistent across the library's models.
```python
import torch
from transformers import DistilBertModel, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
```
Similarly, using the other Distil* models simply consists of calling the base classes with a different pretrained checkpoint:
- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
- DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')`
## How to train Distil*
In the following, we will explain how you can train DistilBERT.
### A. Preparing the data
The weights we release are trained using a concatenation of Toronto Book Corpus and English Wikipedia (same training data as the English version of BERT).
To avoid processing the data several times, we do it once and for all before training. From now on, we will suppose that you have a text file `dump.txt` which contains one sequence per line (a sequence being composed of one or several coherent sentences).
First, we will binarize the data, i.e. tokenize the data and convert each token to an index in our model's vocabulary.
```bash
python scripts/binarized_data.py \
--file_path data/dump.txt \
--tokenizer_type bert \
--tokenizer_name bert-base-uncased \
--dump_file data/binarized_text
```
Our implementation of the masked language modeling loss follows [XLM](https://github.com/facebookresearch/XLM)'s and smooths the masking probability with a factor that puts more emphasis on rare words. We therefore count the occurrences of each token in the data (a sketch of how such counts can be turned into masking weights is shown after the command below):
```bash
python scripts/token_counts.py \
--data_file data/binarized_text.bert-base-uncased.pickle \
--token_counts_dump data/token_counts.bert-base-uncased.pickle \
--vocab_size 30522
```
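The exact smoothing logic lives in `train.py`; the snippet below is only a hedged sketch of the idea, assuming a pickled array of per-token-id counts as produced by `scripts/token_counts.py`. Each token's masking weight is taken proportional to `count ** -alpha`, so rare tokens end up being masked more often; a tensor of this shape is what `Distiller.prepare_batch_mlm` samples from. The exponent value and variable names are illustrative:

```python
import pickle

import numpy as np
import torch

alpha = 0.7  # smoothing exponent (illustrative value, not necessarily the default used by train.py)

with open("data/token_counts.bert-base-uncased.pickle", "rb") as f:
    counts = pickle.load(f)  # per-token-id occurrence counts

# Rare tokens get a larger weight, frequent tokens a smaller one.
token_probs = np.maximum(np.asarray(counts, dtype=np.float64), 1) ** -alpha
token_probs = torch.from_numpy(token_probs)  # later indexed by token id and fed to torch.multinomial
```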
### B. Training
Training with distillation is really simple once you have pre-processed the data:
```bash
python train.py \
--student_type distilbert \
--student_config training_configs/distilbert-base-uncased.json \
--teacher_type bert \
--teacher_name bert-base-uncased \
--alpha_ce 5.0 --alpha_mlm 2.0 --alpha_cos 1.0 --alpha_clm 0.0 --mlm \
--freeze_pos_embs \
--dump_path serialization_dir/my_first_training \
--data_file data/binarized_text.bert-base-uncased.pickle \
--token_counts data/token_counts.bert-base-uncased.pickle \
--force # overwrites the `dump_path` if it already exists.
```
By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.
We highly encourage you to use distributed training for training DistilBERT as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
```bash
export NODE_RANK=0
export N_NODES=1
export N_GPU_NODE=4
export WORLD_SIZE=4
export MASTER_PORT=<AN_OPEN_PORT>
export MASTER_ADDR=<I.P.>
pkill -f 'python -u train.py'
python -m torch.distributed.launch \
--nproc_per_node=$N_GPU_NODE \
--nnodes=$N_NODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
train.py \
--force \
--n_gpu $WORLD_SIZE \
--student_type distilbert \
--student_config training_configs/distilbert-base-uncased.json \
--teacher_type bert \
--teacher_name bert-base-uncased \
--alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --alpha_clm 0.0 --mlm \
--freeze_pos_embs \
--dump_path serialization_dir/my_first_training \
--data_file data/binarized_text.bert-base-uncased.pickle \
--token_counts data/token_counts.bert-base-uncased.pickle
```
**Tip:** Starting the distillation with a good initialization of the student weights is crucial to reach decent performance. In our experiments, we initialized the student from a few layers of the teacher (BERT) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint, and use the `--student_pretrained_weights` argument to use this initialization for the distilled training! A simplified sketch of the extraction idea is shown below.
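The following is a simplified sketch of that layer-extraction idea, not the actual `scripts/extract_distilbert.py` (which also remaps parameter names to the DistilBERT architecture). It keeps BERT's own parameter names and simply copies the embeddings plus every other encoder layer of the teacher into a 6-layer student checkpoint; the layer choice and output path are illustrative:

```python
import torch
from transformers import BertModel

teacher_state = BertModel.from_pretrained("bert-base-uncased").state_dict()

kept_layers = [0, 2, 4, 6, 8, 10]  # which teacher layers to copy (illustrative choice)
student_state = {}
for name, param in teacher_state.items():
    if name.startswith("embeddings."):
        student_state[name] = param  # keep all embedding matrices
    elif name.startswith("encoder.layer."):
        layer_idx = int(name.split(".")[2])
        if layer_idx in kept_layers:
            # Re-index the kept layers so the student sees consecutive layers 0..5.
            new_idx = kept_layers.index(layer_idx)
            new_name = name.replace(f"encoder.layer.{layer_idx}.", f"encoder.layer.{new_idx}.", 1)
            student_state[new_name] = param

torch.save(student_state, "serialization_dir/student_init.pth")  # then pass via --student_pretrained_weights
```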
Happy distillation!
## Citation
If you find the resource useful, you should cite the following paper:
```
@inproceedings{sanh2019distilbert,
title={DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter},
author={Sanh, Victor and Debut, Lysandre and Chaumond, Julien and Wolf, Thomas},
booktitle={NeurIPS EMC^2 Workshop},
year={2019}
}
```


@ -0,0 +1,603 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" The distiller to distil the student.
Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
"""
import math
import os
import time
import psutil
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import BatchSampler, DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups
from lm_seqs_dataset import LmSeqsDataset
from transformers import get_linear_schedule_with_warmup
from utils import logger
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
from tensorboardX import SummaryWriter
class Distiller:
def __init__(
self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module
):
logger.info("Initializing Distiller")
self.params = params
self.dump_path = params.dump_path
self.multi_gpu = params.multi_gpu
self.fp16 = params.fp16
self.student = student
self.teacher = teacher
self.student_config = student.config
self.vocab_size = student.config.vocab_size
if params.n_gpu <= 1:
sampler = RandomSampler(dataset)
else:
sampler = DistributedSampler(dataset)
if params.group_by_size:
groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size)
sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size)
else:
sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)
self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)
self.temperature = params.temperature
assert self.temperature > 0.0
self.alpha_ce = params.alpha_ce
self.alpha_mlm = params.alpha_mlm
self.alpha_clm = params.alpha_clm
self.alpha_mse = params.alpha_mse
self.alpha_cos = params.alpha_cos
self.mlm = params.mlm
if self.mlm:
logger.info(f"Using MLM loss for LM step.")
self.mlm_mask_prop = params.mlm_mask_prop
assert 0.0 <= self.mlm_mask_prop <= 1.0
assert params.word_mask + params.word_keep + params.word_rand == 1.0
self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs
self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs
if self.fp16:
self.pred_probs = self.pred_probs.half()
self.token_probs = self.token_probs.half()
else:
logger.info(f"Using CLM loss for LM step.")
self.epoch = 0
self.n_iter = 0
self.n_total_iter = 0
self.n_sequences_epoch = 0
self.total_loss_epoch = 0
self.last_loss = 0
self.last_loss_ce = 0
self.last_loss_mlm = 0
self.last_loss_clm = 0
if self.alpha_mse > 0.0:
self.last_loss_mse = 0
if self.alpha_cos > 0.0:
self.last_loss_cos = 0
self.last_log = 0
self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
if self.alpha_mse > 0.0:
self.mse_loss_fct = nn.MSELoss(reduction="sum")
if self.alpha_cos > 0.0:
self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")
logger.info("--- Initializing model optimizer")
assert params.gradient_accumulation_steps >= 1
self.num_steps_epoch = len(self.dataloader)
num_train_optimization_steps = (
int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [
p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad
],
"weight_decay": params.weight_decay,
},
{
"params": [
p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad
],
"weight_decay": 0.0,
},
]
logger.info(
"------ Number of trainable parameters (student): %i"
% sum([p.numel() for p in self.student.parameters() if p.requires_grad])
)
logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
self.optimizer = AdamW(
optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)
)
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
self.scheduler = get_linear_schedule_with_warmup(
self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
)
if self.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
self.student, self.optimizer = amp.initialize(
self.student, self.optimizer, opt_level=self.params.fp16_opt_level
)
self.teacher = self.teacher.half()
if self.multi_gpu:
if self.fp16:
from apex.parallel import DistributedDataParallel
logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
self.student = DistributedDataParallel(self.student)
else:
from torch.nn.parallel import DistributedDataParallel
logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
self.student = DistributedDataParallel(
self.student,
device_ids=[params.local_rank],
output_device=params.local_rank,
find_unused_parameters=True,
)
self.is_master = params.is_master
if self.is_master:
logger.info("--- Initializing Tensorboard")
self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train"))
self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0)
self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)
def prepare_batch_mlm(self, batch):
"""
Prepare the batch: from the token_ids and the lengths, compute the attention mask and the masked labels for MLM.
Input:
------
batch: `Tuple`
token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequences. They are padded.
lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch.
Output:
-------
token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels. There is a -100 where there is nothing to predict.
"""
token_ids, lengths = batch
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
assert token_ids.size(0) == lengths.size(0)
attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
bs, max_seq_len = token_ids.size()
mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
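# Choose which positions to mask: sample n_tgt = ceil(mlm_mask_prop * total_tokens) positions without
# replacement, weighted by the per-token-id probabilities in token_probs.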
x_prob = self.token_probs[token_ids.flatten()]
n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
pred_mask = torch.zeros(
bs * max_seq_len, dtype=torch.bool, device=token_ids.device
) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
pred_mask[tgt_ids] = 1
pred_mask = pred_mask.view(bs, max_seq_len)
pred_mask[token_ids == self.params.special_tok_ids["pad_token"]] = 0
# mask a number of words that is a multiple of 8 (faster with fp16)
if self.fp16:
n1 = pred_mask.sum().item()
if n1 > 8:
pred_mask = pred_mask.view(-1)
n2 = max(n1 % 8, 8 * (n1 // 8))
if n2 != n1:
pred_mask[torch.nonzero(pred_mask).view(-1)[: n1 - n2]] = 0
pred_mask = pred_mask.view(bs, max_seq_len)
assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()
_token_ids_real = token_ids[pred_mask]
_token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
_token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids["mask_token"])
probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
_token_ids = (
_token_ids_mask * (probs == 0).long()
+ _token_ids_real * (probs == 1).long()
+ _token_ids_rand * (probs == 2).long()
)
token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
# sanity checks
assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
return token_ids, attn_mask, mlm_labels
def prepare_batch_clm(self, batch):
"""
Prepare the batch: from the token_ids and the lengths, compute the attention mask and the labels for CLM.
Input:
------
batch: `Tuple`
token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequences. They are padded.
lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch.
Output:
-------
token_ids: `torch.tensor(bs, seq_length)` - The token ids after batch rounding (no masking is applied for CLM).
attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -100 where there is nothing to predict.
"""
token_ids, lengths = batch
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
assert token_ids.size(0) == lengths.size(0)
attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
# sanity checks
assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
return token_ids, attn_mask, clm_labels
def round_batch(self, x: torch.tensor, lengths: torch.tensor):
"""
For float16 only.
Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8.
Input:
------
x: `torch.tensor(bs, seq_length)` - The token ids.
lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch.
Output:
-------
x: `torch.tensor(new_bs, new_seq_length)` - The updated token ids.
lengths: `torch.tensor(new_bs)` - The updated lengths.
"""
if not self.fp16 or len(lengths) < 8:
return x, lengths
# make the number of sentences a multiple of 8
bs1 = len(lengths)
bs2 = 8 * (bs1 // 8)
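# e.g. bs1 = 30 -> bs2 = 24: six randomly chosen sentences are dropped so the batch dimension is a multiple of 8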
assert bs2 > 0 and bs2 % 8 == 0
if bs1 != bs2:
idx = torch.randperm(bs1)[:bs2]
lengths = lengths[idx]
slen = lengths.max().item()
x = x[idx, :slen]
else:
idx = None
# pad the sequence length up to a multiple of 8
ml1 = x.size(1)
if ml1 % 8 != 0:
pad = 8 - (ml1 % 8)
ml2 = ml1 + pad
if self.mlm:
pad_id = self.params.special_tok_ids["pad_token"]
else:
pad_id = self.params.special_tok_ids["unk_token"]
padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
x = torch.cat([x, padding_tensor], 1)
assert x.size() == (bs2, ml2)
assert x.size(0) % 8 == 0
assert x.size(1) % 8 == 0
return x, lengths
def train(self):
"""
The real training loop.
"""
if self.is_master:
logger.info("Starting training")
self.last_log = time.time()
self.student.train()
self.teacher.eval()
for _ in range(self.params.n_epoch):
if self.is_master:
logger.info(f"--- Starting epoch {self.epoch}/{self.params.n_epoch-1}")
if self.multi_gpu:
torch.distributed.barrier()
iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
for batch in iter_bar:
if self.params.n_gpu > 0:
batch = tuple(t.to(f"cuda:{self.params.local_rank}") for t in batch)
if self.mlm:
token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
else:
token_ids, attn_mask, lm_labels = self.prepare_batch_clm(batch=batch)
self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)
iter_bar.update()
iter_bar.set_postfix(
{"Last_loss": f"{self.last_loss:.2f}", "Avg_cum_loss": f"{self.total_loss_epoch/self.n_iter:.2f}"}
)
iter_bar.close()
if self.is_master:
logger.info(f"--- Ending epoch {self.epoch}/{self.params.n_epoch-1}")
self.end_epoch()
if self.is_master:
logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
logger.info("Training is finished")
def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
"""
One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
and possibly a parameter update (depending on the gradient accumulation).
Input:
------
input_ids: `torch.tensor(bs, seq_length)` - The token ids.
attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
"""
if self.mlm:
s_logits, s_hidden_states = self.student(
input_ids=input_ids, attention_mask=attention_mask
) # (bs, seq_length, voc_size)
with torch.no_grad():
t_logits, t_hidden_states = self.teacher(
input_ids=input_ids, attention_mask=attention_mask
) # (bs, seq_length, voc_size)
else:
s_logits, _, s_hidden_states = self.student(
input_ids=input_ids, attention_mask=None
) # (bs, seq_length, voc_size)
with torch.no_grad():
t_logits, _, t_hidden_states = self.teacher(
input_ids=input_ids, attention_mask=None
) # (bs, seq_length, voc_size)
assert s_logits.size() == t_logits.size()
# https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
# https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
if self.params.restrict_ce_to_mask:
mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_length, voc_size)
else:
mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_length, voc_size)
s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
assert t_logits_slct.size() == s_logits_slct.size()
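# Soft-target distillation loss (Hinton et al.): KL divergence between the student's log-probabilities and the
# teacher's probabilities, both softened with temperature T, scaled by T**2 to keep gradient magnitudes comparable.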
loss_ce = (
self.ce_loss_fct(
F.log_softmax(s_logits_slct / self.temperature, dim=-1),
F.softmax(t_logits_slct / self.temperature, dim=-1),
)
* (self.temperature) ** 2
)
loss = self.alpha_ce * loss_ce
if self.alpha_mlm > 0.0:
loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
loss += self.alpha_mlm * loss_mlm
if self.alpha_clm > 0.0:
shift_logits = s_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
loss += self.alpha_clm * loss_clm
if self.alpha_mse > 0.0:
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size(
0
) # Reproducing batchmean reduction
loss += self.alpha_mse * loss_mse
if self.alpha_cos > 0.0:
s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim)
t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim)
mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim)
assert s_hidden_states.size() == t_hidden_states.size()
dim = s_hidden_states.size(-1)
s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim)
s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim)
t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
loss += self.alpha_cos * loss_cos
self.total_loss_epoch += loss.item()
self.last_loss = loss.item()
self.last_loss_ce = loss_ce.item()
if self.alpha_mlm > 0.0:
self.last_loss_mlm = loss_mlm.item()
if self.alpha_clm > 0.0:
self.last_loss_clm = loss_clm.item()
if self.alpha_mse > 0.0:
self.last_loss_mse = loss_mse.item()
if self.alpha_cos > 0.0:
self.last_loss_cos = loss_cos.item()
self.optimize(loss)
self.n_sequences_epoch += input_ids.size(0)
def optimize(self, loss):
"""
Normalization on the loss (gradient accumulation or distributed training), followed by
backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
Also update the metrics for tensorboard.
"""
# Check for NaN
if (loss != loss).data.any():
logger.error("NaN detected")
exit()
if self.multi_gpu:
loss = loss.mean()
if self.params.gradient_accumulation_steps > 1:
loss = loss / self.params.gradient_accumulation_steps
if self.fp16:
from apex import amp
with amp.scale_loss(loss, self.optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
self.iter()
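# Only update the weights (and the LR schedule) every gradient_accumulation_steps mini-batches;
# gradients from the intermediate steps are accumulated.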
if self.n_iter % self.params.gradient_accumulation_steps == 0:
if self.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
self.optimizer.step()
self.optimizer.zero_grad()
self.scheduler.step()
def iter(self):
"""
Update global counts, write to tensorboard and save checkpoint.
"""
self.n_iter += 1
self.n_total_iter += 1
if self.n_total_iter % self.params.log_interval == 0:
self.log_tensorboard()
self.last_log = time.time()
if self.n_total_iter % self.params.checkpoint_interval == 0:
self.save_checkpoint()
def log_tensorboard(self):
"""
Log into tensorboard. Only by the master process.
"""
if not self.is_master:
return
for param_name, param in self.student.named_parameters():
self.tensorboard.add_scalar(
tag="parameter_mean/" + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter
)
self.tensorboard.add_scalar(
tag="parameter_std/" + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter
)
if param.grad is None:
continue
self.tensorboard.add_scalar(
tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter
)
self.tensorboard.add_scalar(
tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter
)
self.tensorboard.add_scalar(
tag="losses/cum_avg_loss_epoch",
scalar_value=self.total_loss_epoch / self.n_iter,
global_step=self.n_total_iter,
)
self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter)
self.tensorboard.add_scalar(
tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter
)
if self.alpha_mlm > 0.0:
self.tensorboard.add_scalar(
tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter
)
if self.alpha_clm > 0.0:
self.tensorboard.add_scalar(
tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter
)
if self.alpha_mse > 0.0:
self.tensorboard.add_scalar(
tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter
)
if self.alpha_cos > 0.0:
self.tensorboard.add_scalar(
tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter
)
self.tensorboard.add_scalar(
tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter
)
self.tensorboard.add_scalar(
tag="global/memory_usage",
scalar_value=psutil.virtual_memory()._asdict()["used"] / 1_000_000,
global_step=self.n_total_iter,
)
self.tensorboard.add_scalar(
tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter
)
def end_epoch(self):
"""
Finally arrived at the end of epoch (full pass on dataset).
Do some tensorboard logging and checkpoint saving.
"""
logger.info(f"{self.n_sequences_epoch} sequences have been trained during this epoch.")
if self.is_master:
self.save_checkpoint(checkpoint_name=f"model_epoch_{self.epoch}.pth")
self.tensorboard.add_scalar(
tag="epoch/loss", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch
)
self.epoch += 1
self.n_sequences_epoch = 0
self.n_iter = 0
self.total_loss_epoch = 0
def save_checkpoint(self, checkpoint_name: str = "checkpoint.pth"):
"""
Save the current state. Only by the master process.
"""
if not self.is_master:
return
mdl_to_save = self.student.module if hasattr(self.student, "module") else self.student
mdl_to_save.config.save_pretrained(self.dump_path)
state_dict = mdl_to_save.state_dict()
torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name))

@@ -0,0 +1,108 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Adapted from PyTorch Vision (https://github.com/pytorch/vision/blob/master/references/detection/group_by_aspect_ratio.py)
"""
import bisect
import copy
from collections import defaultdict
import numpy as np
from torch.utils.data.sampler import BatchSampler, Sampler
from utils import logger
def _quantize(x, bins):
bins = copy.deepcopy(bins)
bins = sorted(bins)
quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
return quantized
def create_lengths_groups(lengths, k=0):
bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
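# e.g. with k = 512 (a typical max_model_input_size), bins are [3, 7, 11, ..., 511]; sequence lengths are bucketed into these bins below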
groups = _quantize(lengths, bins)
# count number of elements per group
counts = np.unique(groups, return_counts=True)[1]
fbins = [0] + bins + [np.inf]
logger.info("Using {} as bins for aspect lengths quantization".format(fbins))
logger.info("Count of instances per bin: {}".format(counts))
return groups
class GroupedBatchSampler(BatchSampler):
"""
Wraps another sampler to yield a mini-batch of indices.
It enforces that each batch only contains elements from the same group.
It also tries to provide mini-batches that follow an ordering as close as
possible to the ordering of the original sampler.
Arguments:
sampler (Sampler): Base sampler.
group_ids (list[int]): If the sampler produces indices in range [0, N),
`group_ids` must be a list of `N` ints which contains the group id of each sample.
The group ids must be a continuous set of integers starting from
0, i.e. they must be in the range [0, num_groups).
batch_size (int): Size of mini-batch.
"""
def __init__(self, sampler, group_ids, batch_size):
if not isinstance(sampler, Sampler):
raise ValueError(
"sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler)
)
self.sampler = sampler
self.group_ids = group_ids
self.batch_size = batch_size
def __iter__(self):
buffer_per_group = defaultdict(list)
samples_per_group = defaultdict(list)
num_batches = 0
for idx in self.sampler:
group_id = self.group_ids[idx]
buffer_per_group[group_id].append(idx)
samples_per_group[group_id].append(idx)
if len(buffer_per_group[group_id]) == self.batch_size:
yield buffer_per_group[group_id] # TODO
num_batches += 1
del buffer_per_group[group_id]
assert len(buffer_per_group[group_id]) < self.batch_size
# now we have run out of elements that satisfy
# the group criteria, let's return the remaining
# elements so that the size of the sampler is
# deterministic
expected_num_batches = len(self)
num_remaining = expected_num_batches - num_batches
if num_remaining > 0:
# for the remaining batches, group the batches by similar lengths
batch_idx = []
for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
batch_idx.extend(idxs)
if len(batch_idx) >= self.batch_size:
yield batch_idx[: self.batch_size]
batch_idx = batch_idx[self.batch_size :]
num_remaining -= 1
if len(batch_idx) > 0:
yield batch_idx
num_remaining -= 1
assert num_remaining == 0
def __len__(self):
"""
Return the number of mini-batches rather than the number of samples.
"""
return (len(self.sampler) + self.batch_size - 1) // self.batch_size

@@ -0,0 +1,166 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Dataset to distilled models
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
"""
import numpy as np
import torch
from torch.utils.data import Dataset
from utils import logger
class LmSeqsDataset(Dataset):
"""Custom Dataset wrapping language modeling sequences.
Each sample will be retrieved by indexing the list of token_ids and their corresponding lengths.
Input:
------
params: `NameSpace` parameters
data: `List[np.array[int]]`
"""
def __init__(self, params, data):
self.params = params
self.token_ids = np.array(data)
self.lengths = np.array([len(t) for t in data])
self.check()
self.remove_long_sequences()
self.remove_empty_sequences()
self.remove_unknown_sequences()
self.check()
self.print_statistics()
def __getitem__(self, index):
return (self.token_ids[index], self.lengths[index])
def __len__(self):
return len(self.lengths)
def check(self):
"""
Some sanity checks
"""
assert len(self.token_ids) == len(self.lengths)
assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))
def remove_long_sequences(self):
"""
Sequences that are too long are split into chunks of max_model_input_size.
"""
max_len = self.params.max_model_input_size
indices = self.lengths > max_len
logger.info(f"Splitting {sum(indices)} too long sequences.")
def divide_chunks(l, n):
return [l[i : i + n] for i in range(0, len(l), n)]
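# e.g. divide_chunks(list(range(10)), 4) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]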
new_tok_ids = []
new_lengths = []
if self.params.mlm:
cls_id, sep_id = self.params.special_tok_ids["cls_token"], self.params.special_tok_ids["sep_token"]
else:
cls_id, sep_id = self.params.special_tok_ids["bos_token"], self.params.special_tok_ids["eos_token"]
for seq_, len_ in zip(self.token_ids, self.lengths):
assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
if len_ <= max_len:
new_tok_ids.append(seq_)
new_lengths.append(len_)
else:
sub_seqs = []
for sub_s in divide_chunks(seq_, max_len - 2):
if sub_s[0] != cls_id:
sub_s = np.insert(sub_s, 0, cls_id)
if sub_s[-1] != sep_id:
sub_s = np.insert(sub_s, len(sub_s), sep_id)
assert len(sub_s) <= max_len
assert (sub_s[0] == cls_id) and (sub_s[-1] == sep_id), sub_s
sub_seqs.append(sub_s)
new_tok_ids.extend(sub_seqs)
new_lengths.extend([len(l) for l in sub_seqs])
self.token_ids = np.array(new_tok_ids)
self.lengths = np.array(new_lengths)
def remove_empty_sequences(self):
"""
Sequences that are too short are simply removed. This threshold could be tuned.
"""
init_size = len(self)
indices = self.lengths > 11
self.token_ids = self.token_ids[indices]
self.lengths = self.lengths[indices]
new_size = len(self)
logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
def remove_unknown_sequences(self):
"""
Remove sequences with a (too) high level of unknown tokens.
"""
if "unk_token" not in self.params.special_tok_ids:
return
else:
unk_token_id = self.params.special_tok_ids["unk_token"]
init_size = len(self)
unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
indices = (unk_occs / self.lengths) < 0.5
self.token_ids = self.token_ids[indices]
self.lengths = self.lengths[indices]
new_size = len(self)
logger.info(f"Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).")
def print_statistics(self):
"""
Print some statistics on the corpus. Only the master process.
"""
if not self.params.is_master:
return
logger.info(f"{len(self)} sequences")
# data_len = sum(self.lengths)
# nb_unique_tokens = len(Counter(list(chain(*self.token_ids))))
# logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')
# unk_idx = self.params.special_tok_ids['unk_token']
# nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
# logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
def batch_sequences(self, batch):
"""
Do the padding and transform into torch.tensor.
"""
token_ids = [t[0] for t in batch]
lengths = [t[1] for t in batch]
assert len(token_ids) == len(lengths)
# Max for paddings
max_seq_len_ = max(lengths)
# Pad token ids
if self.params.mlm:
pad_idx = self.params.special_tok_ids["pad_token"]
else:
pad_idx = self.params.special_tok_ids["unk_token"]
tk_ = [list(t.astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids]
assert len(tk_) == len(token_ids)
assert all(len(t) == max_seq_len_ for t in tk_)
tk_t = torch.tensor(tk_) # (bs, max_seq_len_)
lg_t = torch.tensor(lengths) # (bs)
return tk_t, lg_t

@@ -0,0 +1,7 @@
transformers
gitpython==3.0.2
tensorboard>=1.14.0
tensorboardX==1.8
psutil==5.6.3
scipy==1.3.1

@@ -0,0 +1,864 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" This is the exact same script as `examples/run_squad.py` (as of 2020, January 8th) with an additional and optional step of distillation."""
import argparse
import glob
import logging
import os
import random
import timeit
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (
WEIGHTS_NAME,
AdamW,
BertConfig,
BertForQuestionAnswering,
BertTokenizer,
DistilBertConfig,
DistilBertForQuestionAnswering,
DistilBertTokenizer,
XLMConfig,
XLMForQuestionAnswering,
XLMTokenizer,
XLNetConfig,
XLNetForQuestionAnswering,
XLNetTokenizer,
get_linear_schedule_with_warmup,
squad_convert_examples_to_features,
)
from transformers.data.metrics.squad_metrics import (
compute_predictions_log_probs,
compute_predictions_logits,
squad_evaluate,
)
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
from tensorboardX import SummaryWriter
logger = logging.getLogger(__name__)
ALL_MODELS = sum(
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ()
)
MODEL_CLASSES = {
"bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
"xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
"xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
"distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
}
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
def to_list(tensor):
return tensor.detach().cpu().tolist()
def train(args, train_dataset, model, tokenizer, teacher=None):
""" Train the model """
if args.local_rank in [-1, 0]:
tb_writer = SummaryWriter()
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
if args.max_steps > 0:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": args.weight_decay,
},
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
# Check if saved optimizer or scheduler states exist
if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
os.path.join(args.model_name_or_path, "scheduler.pt")
):
# Load in optimizer and scheduler states
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
if args.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
)
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
logger.info(
" Total train batch size (w. parallel, distributed & accumulation) = %d",
args.train_batch_size
* args.gradient_accumulation_steps
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
)
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 1
epochs_trained = 0
steps_trained_in_current_epoch = 0
# Check if continuing training from a checkpoint
if os.path.exists(args.model_name_or_path):
try:
# set global_step to the global_step of the last saved checkpoint from the model path
checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
global_step = int(checkpoint_suffix)
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
logger.info(" Continuing training from epoch %d", epochs_trained)
logger.info(" Continuing training from global step %d", global_step)
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
except ValueError:
logger.info(" Starting fine-tuning.")
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)
# Added here for reproducibility
set_seed(args)
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
# Skip past any already trained steps if resuming training
if steps_trained_in_current_epoch > 0:
steps_trained_in_current_epoch -= 1
continue
model.train()
if teacher is not None:
teacher.eval()
batch = tuple(t.to(args.device) for t in batch)
inputs = {
"input_ids": batch[0],
"attention_mask": batch[1],
"start_positions": batch[3],
"end_positions": batch[4],
}
if args.model_type != "distilbert":
inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2]
if args.model_type in ["xlnet", "xlm"]:
inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
if args.version_2_with_negative:
inputs.update({"is_impossible": batch[7]})
outputs = model(**inputs)
loss, start_logits_stu, end_logits_stu = outputs
# Distillation loss
if teacher is not None:
if "token_type_ids" not in inputs:
inputs["token_type_ids"] = None if args.teacher_type == "xlm" else batch[2]
with torch.no_grad():
start_logits_tea, end_logits_tea = teacher(
input_ids=inputs["input_ids"],
token_type_ids=inputs["token_type_ids"],
attention_mask=inputs["attention_mask"],
)
assert start_logits_tea.size() == start_logits_stu.size()
assert end_logits_tea.size() == end_logits_stu.size()
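# Soft-target distillation loss on the start/end logits: temperature-softened KL divergence scaled by T**2,
# averaged over start and end, then combined with the regular SQuAD loss below.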
loss_fct = nn.KLDivLoss(reduction="batchmean")
loss_start = loss_fct(
F.log_softmax(start_logits_stu / args.temperature, dim=-1),
F.softmax(start_logits_tea / args.temperature, dim=-1),
) * (args.temperature ** 2)
loss_end = loss_fct(
F.log_softmax(end_logits_stu / args.temperature, dim=-1),
F.softmax(end_logits_tea / args.temperature, dim=-1),
) * (args.temperature ** 2)
loss_ce = (loss_start + loss_end) / 2.0
loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
# Log metrics
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
# Only evaluate when single GPU otherwise metrics may not average well
if args.local_rank == -1 and args.evaluate_during_training:
results = evaluate(args, model, tokenizer)
for key, value in results.items():
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
logging_loss = tr_loss
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = (
model.module if hasattr(model, "module") else model
) # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, "training_args.bin"))
logger.info("Saving model checkpoint to %s", output_dir)
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
logger.info("Saving optimizer and scheduler states to %s", output_dir)
if args.max_steps > 0 and global_step > args.max_steps:
epoch_iterator.close()
break
if args.max_steps > 0 and global_step > args.max_steps:
train_iterator.close()
break
if args.local_rank in [-1, 0]:
tb_writer.close()
return global_step, tr_loss / global_step
def evaluate(args, model, tokenizer, prefix=""):
dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# multi-gpu evaluate
if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
model = torch.nn.DataParallel(model)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = %d", len(dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
all_results = []
start_time = timeit.default_timer()
for batch in tqdm(eval_dataloader, desc="Evaluating"):
model.eval()
batch = tuple(t.to(args.device) for t in batch)
with torch.no_grad():
inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
if args.model_type != "distilbert":
inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2] # XLM don't use segment_ids
example_indices = batch[3]
if args.model_type in ["xlnet", "xlm"]:
inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
outputs = model(**inputs)
for i, example_index in enumerate(example_indices):
eval_feature = features[example_index.item()]
unique_id = int(eval_feature.unique_id)
output = [to_list(output[i]) for output in outputs]
# Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
# models only use two.
if len(output) >= 5:
start_logits = output[0]
start_top_index = output[1]
end_logits = output[2]
end_top_index = output[3]
cls_logits = output[4]
result = SquadResult(
unique_id,
start_logits,
end_logits,
start_top_index=start_top_index,
end_top_index=end_top_index,
cls_logits=cls_logits,
)
else:
start_logits, end_logits = output
result = SquadResult(unique_id, start_logits, end_logits)
all_results.append(result)
evalTime = timeit.default_timer() - start_time
logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
# Compute predictions
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
if args.version_2_with_negative:
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
else:
output_null_log_odds_file = None
if args.model_type in ["xlnet", "xlm"]:
# XLNet uses a more complex post-processing procedure
predictions = compute_predictions_log_probs(
examples,
features,
all_results,
args.n_best_size,
args.max_answer_length,
output_prediction_file,
output_nbest_file,
output_null_log_odds_file,
model.config.start_n_top,
model.config.end_n_top,
args.version_2_with_negative,
tokenizer,
args.verbose_logging,
)
else:
predictions = compute_predictions_logits(
examples,
features,
all_results,
args.n_best_size,
args.max_answer_length,
args.do_lower_case,
output_prediction_file,
output_nbest_file,
output_null_log_odds_file,
args.verbose_logging,
args.version_2_with_negative,
args.null_score_diff_threshold,
tokenizer,
)
# Compute the F1 and exact scores.
results = squad_evaluate(examples, predictions)
return results
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
if args.local_rank not in [-1, 0] and not evaluate:
# Make sure only the first process in distributed training processes the dataset; the others will use the cache
torch.distributed.barrier()
# Load data features from cache or dataset file
input_file = args.predict_file if evaluate else args.train_file
cached_features_file = os.path.join(
os.path.dirname(input_file),
"cached_distillation_{}_{}_{}".format(
"dev" if evaluate else "train",
list(filter(None, args.model_name_or_path.split("/"))).pop(),
str(args.max_seq_length),
),
)
if os.path.exists(cached_features_file) and not args.overwrite_cache:
logger.info("Loading features from cached file %s", cached_features_file)
features_and_dataset = torch.load(cached_features_file)
try:
features, dataset, examples = (
features_and_dataset["features"],
features_and_dataset["dataset"],
features_and_dataset["examples"],
)
except KeyError:
raise DeprecationWarning(
"You seem to be loading features from an older version of this script please delete the "
"file %s in order for it to be created again" % cached_features_file
)
else:
logger.info("Creating features from dataset file at %s", input_file)
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
if evaluate:
examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
else:
examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
features, dataset = squad_convert_examples_to_features(
examples=examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=not evaluate,
return_dataset="pt",
threads=args.threads,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)
if args.local_rank == 0 and not evaluate:
# Make sure only the first process in distributed training processes the dataset; the others will use the cache
torch.distributed.barrier()
if output_examples:
return dataset, examples, features
return dataset
def main():
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
)
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints and predictions will be written.",
)
# Distillation parameters (optional)
parser.add_argument(
"--teacher_type",
default=None,
type=str,
help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.",
)
parser.add_argument(
"--teacher_name_or_path",
default=None,
type=str,
help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.",
)
parser.add_argument(
"--alpha_ce", default=0.5, type=float, help="Distillation loss linear weight. Only for distillation."
)
parser.add_argument(
"--alpha_squad", default=0.5, type=float, help="True SQuAD loss linear weight. Only for distillation."
)
parser.add_argument(
"--temperature", default=2.0, type=float, help="Distillation temperature. Only for distillation."
)
# Other parameters
parser.add_argument(
"--data_dir",
default=None,
type=str,
help="The input data dir. Should contain the .json files for the task."
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
)
parser.add_argument(
"--train_file",
default=None,
type=str,
help="The input training file. If a data dir is specified, will look for the file there"
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
)
parser.add_argument(
"--predict_file",
default=None,
type=str,
help="The input evaluation file. If a data dir is specified, will look for the file there"
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
)
parser.add_argument(
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
)
parser.add_argument(
"--tokenizer_name",
default="",
type=str,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from s3",
)
parser.add_argument(
"--version_2_with_negative",
action="store_true",
help="If true, the SQuAD examples contain some that do not have an answer.",
)
parser.add_argument(
"--null_score_diff_threshold",
type=float,
default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.",
)
parser.add_argument(
"--max_seq_length",
default=384,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.",
)
parser.add_argument(
"--doc_stride",
default=128,
type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.",
)
parser.add_argument(
"--max_query_length",
default=64,
type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.",
)
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
)
parser.add_argument(
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
)
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument(
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
)
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument(
"--n_best_size",
default=20,
type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
)
parser.add_argument(
"--max_answer_length",
default=30,
type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.",
)
parser.add_argument(
"--verbose_logging",
action="store_true",
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.",
)
parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
parser.add_argument(
"--eval_all_checkpoints",
action="store_true",
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
parser.add_argument(
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
)
parser.add_argument(
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
parser.add_argument(
"--fp16",
action="store_true",
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
parser.add_argument(
"--fp16_opt_level",
type=str,
default="O1",
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
args = parser.parse_args()
if (
os.path.exists(args.output_dir)
and os.listdir(args.output_dir)
and args.do_train
and not args.overwrite_output_dir
):
raise ValueError(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
args.output_dir
)
)
# Setup distant debugging if needed
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")
args.n_gpu = 1
args.device = device
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
args.local_rank,
device,
args.n_gpu,
bool(args.local_rank != -1),
args.fp16,
)
# Set seed
set_seed(args)
# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
# Make sure only the first process in distributed training will download model & vocab
torch.distributed.barrier()
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(
args.config_name if args.config_name else args.model_name_or_path,
cache_dir=args.cache_dir if args.cache_dir else None,
)
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None,
)
model = model_class.from_pretrained(
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
cache_dir=args.cache_dir if args.cache_dir else None,
)
if args.teacher_type is not None:
assert args.teacher_name_or_path is not None
assert args.alpha_ce > 0.0
assert args.alpha_ce + args.alpha_squad > 0.0
assert args.teacher_type != "distilbert", "We constraint teachers not to be of type DistilBERT."
teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type]
teacher_config = teacher_config_class.from_pretrained(
args.teacher_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None
)
teacher = teacher_model_class.from_pretrained(
args.teacher_name_or_path, config=teacher_config, cache_dir=args.cache_dir if args.cache_dir else None
)
teacher.to(args.device)
else:
teacher = None
if args.local_rank == 0:
# Make sure only the first process in distributed training will download model & vocab
torch.distributed.barrier()
model.to(args.device)
logger.info("Training/evaluation parameters %s", args)
# Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
# Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
# remove the need for this code, but it is still valid.
if args.fp16:
try:
import apex
apex.amp.register_half_function(torch, "einsum")
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
# Training
if args.do_train:
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
# Save the trained model and the tokenizer
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
# Create output directory if needed
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
logger.info("Saving model checkpoint to %s", args.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = (
model.module if hasattr(model, "module") else model
) # Take care of distributed/parallel training
model_to_save.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
# Load a trained model and vocabulary that you have fine-tuned
model = model_class.from_pretrained(args.output_dir)
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
model.to(args.device)
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
results = {}
if args.do_eval and args.local_rank in [-1, 0]:
if args.do_train:
logger.info("Loading checkpoints saved during training for evaluation")
checkpoints = [args.output_dir]
if args.eval_all_checkpoints:
checkpoints = list(
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
)
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
logger.info("Evaluate the following checkpoints: %s", checkpoints)
for checkpoint in checkpoints:
# Reload the model
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
model = model_class.from_pretrained(checkpoint)
model.to(args.device)
# Evaluate
result = evaluate(args, model, tokenizer, prefix=global_step)
result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
results.update(result)
logger.info("Results: {}".format(results))
return results
if __name__ == "__main__":
main()

@@ -0,0 +1,92 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocessing script before distillation.
"""
import argparse
import logging
import pickle
import random
import time
import numpy as np
from transformers import BertTokenizer, GPT2Tokenizer, RobertaTokenizer
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
)
logger = logging.getLogger(__name__)
def main():
parser = argparse.ArgumentParser(
description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
)
parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.")
parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"])
parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.")
parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.")
args = parser.parse_args()
logger.info(f"Loading Tokenizer ({args.tokenizer_name})")
if args.tokenizer_type == "bert":
tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
bos = tokenizer.special_tokens_map["cls_token"] # `[CLS]`
sep = tokenizer.special_tokens_map["sep_token"] # `[SEP]`
elif args.tokenizer_type == "roberta":
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
bos = tokenizer.special_tokens_map["cls_token"] # `<s>`
sep = tokenizer.special_tokens_map["sep_token"] # `</s>`
elif args.tokenizer_type == "gpt2":
tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
bos = tokenizer.special_tokens_map["bos_token"] # `<|endoftext|>`
sep = tokenizer.special_tokens_map["eos_token"] # `<|endoftext|>`
logger.info(f"Loading text from {args.file_path}")
with open(args.file_path, "r", encoding="utf8") as fp:
data = fp.readlines()
logger.info(f"Start encoding")
logger.info(f"{len(data)} examples to process.")
rslt = []
iter = 0
interval = 10000
start = time.time()
for text in data:
text = f"{bos} {text.strip()} {sep}"
token_ids = tokenizer.encode(text, add_special_tokens=False)
rslt.append(token_ids)
iter += 1
if iter % interval == 0:
end = time.time()
logger.info(f"{iter} examples processed. - {(end-start)/interval:.2f}s/expl")
start = time.time()
logger.info("Finished binarization")
logger.info(f"{len(data)} examples processed.")
dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
rslt_ = [np.uint16(d) for d in rslt]
random.shuffle(rslt_)
logger.info(f"Dump to {dp_file}")
with open(dp_file, "wb") as handle:
pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
if __name__ == "__main__":
main()
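A minimal sketch of reading back the dump this script produces; the file name follows the f"{dump_file}.{tokenizer_name}.pickle" pattern with the defaults above, and is otherwise hypothetical:

import pickle

with open("data/dump.bert-base-uncased.pickle", "rb") as fp:
    sequences = pickle.load(fp)

# `sequences` is a shuffled list with one entry per input line; each entry holds the
# token ids (stored as uint16) of "[CLS] <line> [SEP]" for the chosen tokenizer.
print(len(sequences), sequences[0][:10])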

View File

@ -0,0 +1,102 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocessing script before training the distilled model.
Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2.
"""
import argparse
import torch
from transformers import GPT2LMHeadModel, RobertaForMaskedLM
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation"
)
parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
parser.add_argument("--model_name", default="roberta-large", type=str)
parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_roberta_048131723.pth", type=str)
parser.add_argument("--vocab_transform", action="store_true")
args = parser.parse_args()
if args.model_type == "roberta":
model = RobertaForMaskedLM.from_pretrained(args.model_name)
prefix = "roberta"
elif args.model_type == "gpt2":
model = GPT2LMHeadModel.from_pretrained(args.model_name)
prefix = "transformer"
state_dict = model.state_dict()
compressed_sd = {}
# Embeddings #
if args.model_type == "gpt2":
for param_name in ["wte.weight", "wpe.weight"]:
compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"]
else:
for w in ["word_embeddings", "position_embeddings", "token_type_embeddings"]:
param_name = f"{prefix}.embeddings.{w}.weight"
compressed_sd[param_name] = state_dict[param_name]
for w in ["weight", "bias"]:
param_name = f"{prefix}.embeddings.LayerNorm.{w}"
compressed_sd[param_name] = state_dict[param_name]
# Transformer Blocks #
std_idx = 0
for teacher_idx in [0, 2, 4, 7, 9, 11]:
if args.model_type == "gpt2":
for layer in ["ln_1", "attn.c_attn", "attn.c_proj", "ln_2", "mlp.c_fc", "mlp.c_proj"]:
for w in ["weight", "bias"]:
compressed_sd[f"{prefix}.h.{std_idx}.{layer}.{w}"] = state_dict[
f"{prefix}.h.{teacher_idx}.{layer}.{w}"
]
compressed_sd[f"{prefix}.h.{std_idx}.attn.bias"] = state_dict[f"{prefix}.h.{teacher_idx}.attn.bias"]
else:
for layer in [
"attention.self.query",
"attention.self.key",
"attention.self.value",
"attention.output.dense",
"attention.output.LayerNorm",
"intermediate.dense",
"output.dense",
"output.LayerNorm",
]:
for w in ["weight", "bias"]:
compressed_sd[f"{prefix}.encoder.layer.{std_idx}.{layer}.{w}"] = state_dict[
f"{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}"
]
std_idx += 1
# Language Modeling Head #
if args.model_type == "roberta":
for layer in ["lm_head.decoder.weight", "lm_head.bias"]:
compressed_sd[f"{layer}"] = state_dict[f"{layer}"]
if args.vocab_transform:
for w in ["weight", "bias"]:
compressed_sd[f"lm_head.dense.{w}"] = state_dict[f"lm_head.dense.{w}"]
compressed_sd[f"lm_head.layer_norm.{w}"] = state_dict[f"lm_head.layer_norm.{w}"]
elif args.model_type == "gpt2":
for w in ["weight", "bias"]:
compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"]
compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"]
print(f"N layers selected for distillation: {std_idx}")
print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
torch.save(compressed_sd, args.dump_checkpoint)
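The extraction above copies a fixed subset of teacher layers into consecutive student slots; a small sketch of the resulting index mapping:

# Sketch of the teacher-layer -> student-layer mapping used above:
# teacher layers [0, 2, 4, 7, 9, 11] become student layers [0, 1, 2, 3, 4, 5].
teacher_layers = [0, 2, 4, 7, 9, 11]
mapping = {teacher_idx: std_idx for std_idx, teacher_idx in enumerate(teacher_layers)}
print(mapping)  # {0: 0, 2: 1, 4: 2, 7: 3, 9: 4, 11: 5}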

View File

@ -0,0 +1,92 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocessing script before training DistilBERT.
Specific to BERT -> DistilBERT.
"""
import argparse
import torch
from transformers import BertForMaskedLM
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation"
)
parser.add_argument("--model_type", default="bert", choices=["bert"])
parser.add_argument("--model_name", default="bert-base-uncased", type=str)
parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_bert-base-uncased_0247911.pth", type=str)
parser.add_argument("--vocab_transform", action="store_true")
args = parser.parse_args()
if args.model_type == "bert":
model = BertForMaskedLM.from_pretrained(args.model_name)
prefix = "bert"
else:
raise ValueError(f'args.model_type should be "bert".')
state_dict = model.state_dict()
compressed_sd = {}
for w in ["word_embeddings", "position_embeddings"]:
compressed_sd[f"distilbert.embeddings.{w}.weight"] = state_dict[f"{prefix}.embeddings.{w}.weight"]
for w in ["weight", "bias"]:
compressed_sd[f"distilbert.embeddings.LayerNorm.{w}"] = state_dict[f"{prefix}.embeddings.LayerNorm.{w}"]
std_idx = 0
for teacher_idx in [0, 2, 4, 7, 9, 11]:
for w in ["weight", "bias"]:
compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}"] = state_dict[
f"{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}"
]
compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}"] = state_dict[
f"{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}"
]
compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}"] = state_dict[
f"{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}"
]
compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}"] = state_dict[
f"{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}"
]
compressed_sd[f"distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}"] = state_dict[
f"{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}"
]
compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}"] = state_dict[
f"{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}"
]
compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}"] = state_dict[
f"{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}"
]
compressed_sd[f"distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}"] = state_dict[
f"{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}"
]
std_idx += 1
compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"]
compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"]
if args.vocab_transform:
for w in ["weight", "bias"]:
compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"]
print(f"N layers selected for distillation: {std_idx}")
print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
torch.save(compressed_sd, args.dump_checkpoint)
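A minimal sketch (not part of the project's scripts) of inspecting the extracted student initialization; the checkpoint path is the --dump_checkpoint default above:

import torch

compressed_sd = torch.load("serialization_dir/tf_bert-base-uncased_0247911.pth", map_location="cpu")
print(len(compressed_sd), "tensors")
# Keys follow DistilBERT naming, e.g. "distilbert.transformer.layer.0.attention.q_lin.weight",
# so the dict can be passed to train.py via --student_pretrained_weights.
for name in list(compressed_sd)[:5]:
    print(name, tuple(compressed_sd[name].shape))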

View File

@ -0,0 +1,56 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocessing script before training the distilled model.
"""
import argparse
import logging
import pickle
from collections import Counter
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
)
logger = logging.getLogger(__name__)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)"
)
parser.add_argument(
"--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset."
)
parser.add_argument(
"--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file."
)
parser.add_argument("--vocab_size", default=30522, type=int)
args = parser.parse_args()
logger.info(f"Loading data from {args.data_file}")
with open(args.data_file, "rb") as fp:
data = pickle.load(fp)
logger.info("Counting occurences for MLM.")
counter = Counter()
for tk_ids in data:
counter.update(tk_ids)
counts = [0] * args.vocab_size
for k, v in counter.items():
counts[k] = v
logger.info(f"Dump to {args.token_counts_dump}")
with open(args.token_counts_dump, "wb") as handle:
pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
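train.py (shown later in this diff) turns these counts into smoothed masking probabilities with an exponent of -mlm_smoothing; a small sketch of that step, using toy counts:

import numpy as np

counts = np.array([0, 5, 100, 10000])       # toy counts for four vocabulary ids
token_probs = np.maximum(counts, 1) ** -0.7  # mlm_smoothing defaults to 0.7 in train.py
# Rare tokens get a larger (unnormalized) probability of being masked than frequent
# ones; ids never seen in the data are clamped to a count of 1.
print(token_probs)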

View File

@ -0,0 +1,322 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Training the distilled model.
Supported architectures include: BERT -> DistilBERT, RoBERTa -> DistilRoBERTa, GPT2 -> DistilGPT2.
"""
import argparse
import json
import os
import pickle
import shutil
import numpy as np
import torch
from distiller import Distiller
from lm_seqs_dataset import LmSeqsDataset
from transformers import (
BertConfig,
BertForMaskedLM,
BertTokenizer,
DistilBertConfig,
DistilBertForMaskedLM,
DistilBertTokenizer,
GPT2Config,
GPT2LMHeadModel,
GPT2Tokenizer,
RobertaConfig,
RobertaForMaskedLM,
RobertaTokenizer,
)
from utils import git_log, init_gpu_params, logger, set_seed
MODEL_CLASSES = {
"distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
"roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
"bert": (BertConfig, BertForMaskedLM, BertTokenizer),
"gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
}
def sanity_checks(args):
"""
A bunch of args sanity checks to perform even before starting...
"""
assert (args.mlm and args.alpha_mlm > 0.0) or (not args.mlm and args.alpha_mlm == 0.0)
assert (args.alpha_mlm > 0.0 and args.alpha_clm == 0.0) or (args.alpha_mlm == 0.0 and args.alpha_clm > 0.0)
if args.mlm:
assert os.path.isfile(args.token_counts)
assert (args.student_type in ["roberta", "distilbert"]) and (args.teacher_type in ["roberta", "bert"])
else:
assert (args.student_type in ["gpt2"]) and (args.teacher_type in ["gpt2"])
assert args.teacher_type == args.student_type or (
args.student_type == "distilbert" and args.teacher_type == "bert"
)
assert os.path.isfile(args.student_config)
if args.student_pretrained_weights is not None:
assert os.path.isfile(args.student_pretrained_weights)
if args.freeze_token_type_embds:
assert args.student_type in ["roberta"]
assert args.alpha_ce >= 0.0
assert args.alpha_mlm >= 0.0
assert args.alpha_clm >= 0.0
assert args.alpha_mse >= 0.0
assert args.alpha_cos >= 0.0
assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.0
def freeze_pos_embeddings(student, args):
if args.student_type == "roberta":
student.roberta.embeddings.position_embeddings.weight.requires_grad = False
elif args.student_type == "gpt2":
student.transformer.wpe.weight.requires_grad = False
def freeze_token_type_embeddings(student, args):
if args.student_type == "roberta":
student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False
def main():
parser = argparse.ArgumentParser(description="Training")
parser.add_argument("--force", action="store_true", help="Overwrite dump_path if it already exists.")
parser.add_argument(
"--dump_path", type=str, required=True, help="The output directory (log, checkpoints, parameters, etc.)"
)
parser.add_argument(
"--data_file",
type=str,
required=True,
help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.",
)
parser.add_argument(
"--student_type",
type=str,
choices=["distilbert", "roberta", "gpt2"],
required=True,
help="The student type (DistilBERT, RoBERTa).",
)
parser.add_argument("--student_config", type=str, required=True, help="Path to the student configuration.")
parser.add_argument(
"--student_pretrained_weights", default=None, type=str, help="Load student initialization checkpoint."
)
parser.add_argument(
"--teacher_type", choices=["bert", "roberta", "gpt2"], required=True, help="Teacher type (BERT, RoBERTa)."
)
parser.add_argument("--teacher_name", type=str, required=True, help="The teacher model.")
parser.add_argument("--temperature", default=2.0, type=float, help="Temperature for the softmax temperature.")
parser.add_argument(
"--alpha_ce", default=0.5, type=float, help="Linear weight for the distillation loss. Must be >=0."
)
parser.add_argument(
"--alpha_mlm",
default=0.0,
type=float,
help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.",
)
parser.add_argument("--alpha_clm", default=0.5, type=float, help="Linear weight for the CLM loss. Must be >=0.")
parser.add_argument("--alpha_mse", default=0.0, type=float, help="Linear weight of the MSE loss. Must be >=0.")
parser.add_argument(
"--alpha_cos", default=0.0, type=float, help="Linear weight of the cosine embedding loss. Must be >=0."
)
parser.add_argument(
"--mlm", action="store_true", help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM."
)
parser.add_argument(
"--mlm_mask_prop",
default=0.15,
type=float,
help="Proportion of tokens for which we need to make a prediction.",
)
parser.add_argument("--word_mask", default=0.8, type=float, help="Proportion of tokens to mask out.")
parser.add_argument("--word_keep", default=0.1, type=float, help="Proportion of tokens to keep.")
parser.add_argument("--word_rand", default=0.1, type=float, help="Proportion of tokens to randomly replace.")
parser.add_argument(
"--mlm_smoothing",
default=0.7,
type=float,
help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).",
)
parser.add_argument("--token_counts", type=str, help="The token counts in the data_file for MLM.")
parser.add_argument(
"--restrict_ce_to_mask",
action="store_true",
help="If true, compute the distilation loss only the [MLM] prediction distribution.",
)
parser.add_argument(
"--freeze_pos_embs",
action="store_true",
help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.",
)
parser.add_argument(
"--freeze_token_type_embds",
action="store_true",
help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.",
)
parser.add_argument("--n_epoch", type=int, default=3, help="Number of pass on the whole dataset.")
parser.add_argument("--batch_size", type=int, default=5, help="Batch size (for each process).")
parser.add_argument(
"--group_by_size",
action="store_false",
help="If true, group sequences that have similar length into the same batch. Default is true.",
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=50,
help="Gradient accumulation for larger training batches.",
)
parser.add_argument("--warmup_prop", default=0.05, type=float, help="Linear warmup proportion.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
parser.add_argument("--learning_rate", default=5e-4, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=5.0, type=float, help="Max gradient norm.")
parser.add_argument("--initializer_range", default=0.02, type=float, help="Random initialization range.")
parser.add_argument(
"--fp16",
action="store_true",
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
parser.add_argument(
"--fp16_opt_level",
type=str,
default="O1",
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs in the node.")
parser.add_argument("--local_rank", type=int, default=-1, help="Distributed training - Local rank")
parser.add_argument("--seed", type=int, default=56, help="Random seed")
parser.add_argument("--log_interval", type=int, default=500, help="Tensorboard logging interval.")
parser.add_argument("--checkpoint_interval", type=int, default=4000, help="Checkpoint interval.")
args = parser.parse_args()
sanity_checks(args)
# ARGS #
init_gpu_params(args)
set_seed(args)
if args.is_master:
if os.path.exists(args.dump_path):
if not args.force:
raise ValueError(
f"Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it"
"Use `--force` if you want to overwrite it"
)
else:
shutil.rmtree(args.dump_path)
if not os.path.exists(args.dump_path):
os.makedirs(args.dump_path)
logger.info(f"Experiment will be dumped and logged in {args.dump_path}")
# SAVE PARAMS #
logger.info(f"Param: {args}")
with open(os.path.join(args.dump_path, "parameters.json"), "w") as f:
json.dump(vars(args), f, indent=4)
git_log(args.dump_path)
student_config_class, student_model_class, _ = MODEL_CLASSES[args.student_type]
teacher_config_class, teacher_model_class, teacher_tokenizer_class = MODEL_CLASSES[args.teacher_type]
# TOKENIZER #
tokenizer = teacher_tokenizer_class.from_pretrained(args.teacher_name)
special_tok_ids = {}
for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
idx = tokenizer.all_special_tokens.index(tok_symbol)
special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
logger.info(f"Special tokens {special_tok_ids}")
args.special_tok_ids = special_tok_ids
args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name]
# DATA LOADER #
logger.info(f"Loading data from {args.data_file}")
with open(args.data_file, "rb") as fp:
data = pickle.load(fp)
if args.mlm:
logger.info(f"Loading token counts from {args.token_counts} (already pre-computed)")
with open(args.token_counts, "rb") as fp:
counts = pickle.load(fp)
token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
for idx in special_tok_ids.values():
token_probs[idx] = 0.0 # do not predict special tokens
token_probs = torch.from_numpy(token_probs)
else:
token_probs = None
train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
logger.info(f"Data loader created.")
# STUDENT #
logger.info(f"Loading student config from {args.student_config}")
stu_architecture_config = student_config_class.from_pretrained(args.student_config)
stu_architecture_config.output_hidden_states = True
if args.student_pretrained_weights is not None:
logger.info(f"Loading pretrained weights from {args.student_pretrained_weights}")
student = student_model_class.from_pretrained(args.student_pretrained_weights, config=stu_architecture_config)
else:
student = student_model_class(stu_architecture_config)
if args.n_gpu > 0:
student.to(f"cuda:{args.local_rank}")
logger.info(f"Student loaded.")
# TEACHER #
teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
if args.n_gpu > 0:
teacher.to(f"cuda:{args.local_rank}")
logger.info(f"Teacher loaded from {args.teacher_name}.")
# FREEZING #
if args.freeze_pos_embs:
freeze_pos_embeddings(student, args)
if args.freeze_token_type_embds:
freeze_token_type_embeddings(student, args)
# SANITY CHECKS #
assert student.config.vocab_size == teacher.config.vocab_size
assert student.config.hidden_size == teacher.config.hidden_size
assert student.config.max_position_embeddings == teacher.config.max_position_embeddings
if args.mlm:
assert token_probs.size(0) == stu_architecture_config.vocab_size
# DISTILLER #
torch.cuda.empty_cache()
distiller = Distiller(
params=args, dataset=train_lm_seq_dataset, token_probs=token_probs, student=student, teacher=teacher
)
distiller.train()
logger.info("Let's go get some drinks.")
if __name__ == "__main__":
main()
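A minimal sketch of the argument combinations accepted by sanity_checks() above, using an ad-hoc namespace instead of the real argparse output (the values are illustrative):

from types import SimpleNamespace

mlm_args = SimpleNamespace(
    mlm=True, alpha_mlm=0.5, alpha_clm=0.0, alpha_ce=0.5,
    alpha_mse=0.0, alpha_cos=0.0,
    student_type="distilbert", teacher_type="bert",
)
# Passes the loss-weight checks: `mlm` requires alpha_mlm > 0 and alpha_clm == 0,
# and at least one loss weight must be strictly positive.
assert mlm_args.alpha_mlm > 0.0 and mlm_args.alpha_clm == 0.0
assert sum([mlm_args.alpha_ce, mlm_args.alpha_mlm, mlm_args.alpha_clm,
            mlm_args.alpha_mse, mlm_args.alpha_cos]) > 0.0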

View File

@ -0,0 +1,15 @@
{
"activation": "gelu",
"attention_dropout": 0.1,
"dim": 768,
"dropout": 0.1,
"hidden_dim": 3072,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"n_heads": 12,
"n_layers": 6,
"sinusoidal_pos_embds": true,
"tie_weights_": true,
"vocab_size": 119547
}

View File

@ -0,0 +1,15 @@
{
"activation": "gelu",
"attention_dropout": 0.1,
"dim": 768,
"dropout": 0.1,
"hidden_dim": 3072,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"n_heads": 12,
"n_layers": 6,
"sinusoidal_pos_embds": true,
"tie_weights_": true,
"vocab_size": 30522
}

View File

@ -0,0 +1,10 @@
{
"initializer_range": 0.02,
"layer_norm_epsilon": 0.00001,
"n_ctx": 1024,
"n_embd": 768,
"n_head": 12,
"n_layer": 6,
"n_positions": 1024,
"vocab_size": 50257
}

View File

@ -0,0 +1,14 @@
{
"vocab_size": 50265,
"hidden_size": 768,
"num_hidden_layers": 6,
"num_attention_heads": 12,
"intermediate_size": 3072,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"max_position_embeddings": 514,
"type_vocab_size": 1,
"initializer_range": 0.02,
"layer_norm_eps": 0.00001
}
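These JSON files are the student configurations fed to train.py via --student_config. A sketch, assuming a hypothetical path for the DistilRoBERTa config committed above, of how train.py loads one with the matching config class:

from transformers import RobertaConfig

# Path is illustrative; train.py passes whatever --student_config points at.
config = RobertaConfig.from_pretrained("training_configs/distilroberta-base.json")
config.output_hidden_states = True  # train.py enables this for the hidden-state losses
print(config.num_hidden_layers, config.hidden_size)  # 6, 768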

View File

@ -0,0 +1,132 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Utils to train DistilBERT
adapted in part from Facebook, Inc.'s XLM model (https://github.com/facebookresearch/XLM)
"""
import json
import logging
import os
import socket
import git
import numpy as np
import torch
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger = logging.getLogger(__name__)
def git_log(folder_path: str):
"""
Log commit info.
"""
repo = git.Repo(search_parent_directories=True)
repo_infos = {
"repo_id": str(repo),
"repo_sha": str(repo.head.object.hexsha),
"repo_branch": str(repo.active_branch),
}
with open(os.path.join(folder_path, "git_log.json"), "w") as f:
json.dump(repo_infos, f, indent=4)
def init_gpu_params(params):
"""
Handle single and multi-GPU / multi-node.
"""
if params.n_gpu <= 0:
params.local_rank = 0
params.master_port = -1
params.is_master = True
params.multi_gpu = False
return
assert torch.cuda.is_available()
logger.info("Initializing GPUs")
if params.n_gpu > 1:
assert params.local_rank != -1
params.world_size = int(os.environ["WORLD_SIZE"])
params.n_gpu_per_node = int(os.environ["N_GPU_NODE"])
params.global_rank = int(os.environ["RANK"])
# number of nodes / node ID
params.n_nodes = params.world_size // params.n_gpu_per_node
params.node_id = params.global_rank // params.n_gpu_per_node
params.multi_gpu = True
assert params.n_nodes == int(os.environ["N_NODES"])
assert params.node_id == int(os.environ["NODE_RANK"])
# local job (single GPU)
else:
assert params.local_rank == -1
params.n_nodes = 1
params.node_id = 0
params.local_rank = 0
params.global_rank = 0
params.world_size = 1
params.n_gpu_per_node = 1
params.multi_gpu = False
# sanity checks
assert params.n_nodes >= 1
assert 0 <= params.node_id < params.n_nodes
assert 0 <= params.local_rank <= params.global_rank < params.world_size
assert params.world_size == params.n_nodes * params.n_gpu_per_node
# define whether this is the master process / if we are in multi-node distributed mode
params.is_master = params.node_id == 0 and params.local_rank == 0
params.multi_node = params.n_nodes > 1
# summary
PREFIX = f"--- Global rank: {params.global_rank} - "
logger.info(PREFIX + "Number of nodes: %i" % params.n_nodes)
logger.info(PREFIX + "Node ID : %i" % params.node_id)
logger.info(PREFIX + "Local rank : %i" % params.local_rank)
logger.info(PREFIX + "World size : %i" % params.world_size)
logger.info(PREFIX + "GPUs per node : %i" % params.n_gpu_per_node)
logger.info(PREFIX + "Master : %s" % str(params.is_master))
logger.info(PREFIX + "Multi-node : %s" % str(params.multi_node))
logger.info(PREFIX + "Multi-GPU : %s" % str(params.multi_gpu))
logger.info(PREFIX + "Hostname : %s" % socket.gethostname())
# set GPU device
torch.cuda.set_device(params.local_rank)
# initialize multi-GPU
if params.multi_gpu:
logger.info("Initializing PyTorch distributed")
torch.distributed.init_process_group(
init_method="env://", backend="nccl",
)
def set_seed(args):
"""
Set the random seed.
"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
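A small worked example of the topology arithmetic in init_gpu_params() above, for 2 nodes with 4 GPUs each, looking at the process with global rank 5 (the local rank is normally supplied by the launcher; deriving it with a modulo here is just for illustration):

world_size, n_gpu_per_node, global_rank = 8, 4, 5

n_nodes = world_size // n_gpu_per_node      # 2
node_id = global_rank // n_gpu_per_node     # 1 -> second node
local_rank = global_rank % n_gpu_per_node   # 1 (illustrative; the env var provides this)
is_master = node_id == 0 and local_rank == 0  # only the first process on node 0 is master
print(n_nodes, node_id, local_rank, is_master)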

View File

@ -1,297 +0,0 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from a PyTorch BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import logging
import json
import re
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class InputExample(object):
def __init__(self, unique_id, text_a, text_b):
self.unique_id = unique_id
self.text_a = text_a
self.text_b = text_b
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
self.unique_id = unique_id
self.tokens = tokens
self.input_ids = input_ids
self.input_mask = input_mask
self.input_type_ids = input_type_ids
def convert_examples_to_features(examples, seq_length, tokenizer):
"""Loads a data file into a list of `InputBatch`s."""
features = []
for (ex_index, example) in enumerate(examples):
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > seq_length - 2:
tokens_a = tokens_a[0:(seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
input_type_ids = []
tokens.append("[CLS]")
input_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
input_type_ids.append(0)
tokens.append("[SEP]")
input_type_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
input_type_ids.append(1)
tokens.append("[SEP]")
input_type_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < seq_length:
input_ids.append(0)
input_mask.append(0)
input_type_ids.append(0)
assert len(input_ids) == seq_length
assert len(input_mask) == seq_length
assert len(input_type_ids) == seq_length
if ex_index < 5:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (example.unique_id))
logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
features.append(
InputFeatures(
unique_id=example.unique_id,
tokens=tokens,
input_ids=input_ids,
input_mask=input_mask,
input_type_ids=input_type_ids))
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def read_examples(input_file):
"""Read a list of `InputExample`s from an input file."""
examples = []
unique_id = 0
with open(input_file, "r", encoding='utf-8') as reader:
while True:
line = reader.readline()
if not line:
break
line = line.strip()
text_a = None
text_b = None
m = re.match(r"^(.*) \|\|\| (.*)$", line)
if m is None:
text_a = line
else:
text_a = m.group(1)
text_b = m.group(2)
examples.append(
InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
unique_id += 1
return examples
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_file", default=None, type=str, required=True)
parser.add_argument("--output_file", default=None, type=str, required=True)
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
## Other parameters
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
parser.add_argument("--max_seq_length", default=128, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
"than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help = "local_rank for distributed training on gpus")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))
layer_indexes = [int(x) for x in args.layers.split(",")]
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
examples = read_examples(args.input_file)
features = convert_examples_to_features(
examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
unique_id_to_feature[feature.unique_id] = feature
model = BertModel.from_pretrained(args.bert_model)
model.to(device)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
output_device=args.local_rank)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
if args.local_rank == -1:
eval_sampler = SequentialSampler(eval_data)
else:
eval_sampler = DistributedSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
model.eval()
with open(args.output_file, "w", encoding='utf-8') as writer:
for input_ids, input_mask, example_indices in eval_dataloader:
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
all_encoder_layers = all_encoder_layers
for b, example_index in enumerate(example_indices):
feature = features[example_index.item()]
unique_id = int(feature.unique_id)
# feature = unique_id_to_feature[unique_id]
output_json = collections.OrderedDict()
output_json["linex_index"] = unique_id
all_out_features = []
for (i, token) in enumerate(feature.tokens):
all_layers = []
for (j, layer_index) in enumerate(layer_indexes):
layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
layer_output = layer_output[b]
layers = collections.OrderedDict()
layers["index"] = layer_index
layers["values"] = [
round(x.item(), 6) for x in layer_output[i]
]
all_layers.append(layers)
out_features = collections.OrderedDict()
out_features["token"] = token
out_features["layers"] = all_layers
all_out_features.append(out_features)
output_json["features"] = all_out_features
writer.write(json.dumps(output_json) + "\n")
if __name__ == "__main__":
main()
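A sketch of the input line format accepted by read_examples() above: a single sentence per line, or a sentence pair separated by " ||| " (the example sentences are illustrative):

import re

for line in ["The dog is hairy .", "Is this Jacksonville ? ||| No it is not ."]:
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
        print("single:", line)           # becomes text_a only
    else:
        print("pair:", m.group(1), "/", m.group(2))  # becomes text_a and text_b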

View File

@ -0,0 +1,221 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" GLUE processors and helpers """
import logging
import os
from transformers.file_utils import is_tf_available
from utils_hans import DataProcessor, InputExample, InputFeatures
if is_tf_available():
import tensorflow as tf
logger = logging.getLogger(__name__)
def hans_convert_examples_to_features(
examples,
tokenizer,
max_length=512,
task=None,
label_list=None,
output_mode=None,
pad_on_left=False,
pad_token=0,
pad_token_segment_id=0,
mask_padding_with_zero=True,
):
"""
Loads a data file into a list of ``InputFeatures``
Args:
examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length
task: HANS
label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
output_mode: String indicating the output mode. Either ``regression`` or ``classification``
pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
pad_token: Padding token
pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
actual values)
Returns:
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
containing the task-specific features. If the input is a list of ``InputExamples``, will return
a list of task-specific ``InputFeatures`` which can be fed to the model.
"""
is_tf_dataset = False
if is_tf_available() and isinstance(examples, tf.data.Dataset):
is_tf_dataset = True
if task is not None:
processor = glue_processors[task]()
if label_list is None:
label_list = processor.get_labels()
logger.info("Using label list %s for task %s" % (label_list, task))
if output_mode is None:
output_mode = glue_output_modes[task]
logger.info("Using output mode %s for task %s" % (output_mode, task))
label_map = {label: i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
logger.info("Writing example %d" % (ex_index))
if is_tf_dataset:
example = processor.get_example_from_tensor_dict(example)
example = processor.tfds_map(example)
inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
# Zero-pad up to the sequence length.
padding_length = max_length - len(input_ids)
if pad_on_left:
input_ids = ([pad_token] * padding_length) + input_ids
attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
else:
input_ids = input_ids + ([pad_token] * padding_length)
attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
len(attention_mask), max_length
)
assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
len(token_type_ids), max_length
)
if output_mode == "classification":
label = label_map[example.label] if example.label in label_map else 0
elif output_mode == "regression":
label = float(example.label)
else:
raise KeyError(output_mode)
pairID = str(example.pairID)
if ex_index < 10:
logger.info("*** Example ***")
logger.info("text_a: %s" % (example.text_a))
logger.info("text_b: %s" % (example.text_b))
logger.info("guid: %s" % (example.guid))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
logger.info("label: %s (id = %d)" % (example.label, label))
features.append(
InputFeatures(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
label=label,
pairID=pairID,
)
)
if is_tf_available() and is_tf_dataset:
def gen():
for ex in features:
yield (
{
"input_ids": ex.input_ids,
"attention_mask": ex.attention_mask,
"token_type_ids": ex.token_type_ids,
},
ex.label,
)
return tf.data.Dataset.from_generator(
gen,
({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
(
{
"input_ids": tf.TensorShape([None]),
"attention_mask": tf.TensorShape([None]),
"token_type_ids": tf.TensorShape([None]),
},
tf.TensorShape([]),
),
)
return features
class HansProcessor(DataProcessor):
"""Processor for the HANS data set."""
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["premise"].numpy().decode("utf-8"),
tensor_dict["hypothesis"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[5]
text_b = line[6]
pairID = line[7][2:] if line[7].startswith("ex") else line[7]
label = line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
return examples
glue_tasks_num_labels = {
"hans": 3,
}
glue_processors = {
"hans": HansProcessor,
}
glue_output_modes = {
"hans": "classification",
}

examples/hans/test_hans.py
View File

@ -0,0 +1,643 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
from __future__ import absolute_import, division, print_function
import argparse
import glob
import logging
import os
import random
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from hans_processors import glue_output_modes as output_modes
from hans_processors import glue_processors as processors
from hans_processors import hans_convert_examples_to_features as convert_examples_to_features
from transformers import (
WEIGHTS_NAME,
AdamW,
AlbertConfig,
AlbertForSequenceClassification,
AlbertTokenizer,
BertConfig,
BertForSequenceClassification,
BertTokenizer,
DistilBertConfig,
DistilBertForSequenceClassification,
DistilBertTokenizer,
RobertaConfig,
RobertaForSequenceClassification,
RobertaTokenizer,
XLMConfig,
XLMForSequenceClassification,
XLMTokenizer,
XLNetConfig,
XLNetForSequenceClassification,
XLNetTokenizer,
get_linear_schedule_with_warmup,
)
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
from tensorboardX import SummaryWriter
logger = logging.getLogger(__name__)
ALL_MODELS = sum(
(
tuple(conf.pretrained_config_archive_map.keys())
for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
),
(),
)
MODEL_CLASSES = {
"bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
"xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
"xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
"roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
"distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
"albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
}
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
def train(args, train_dataset, model, tokenizer):
""" Train the model """
if args.local_rank in [-1, 0]:
tb_writer = SummaryWriter()
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
if args.max_steps > 0:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": args.weight_decay,
},
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
if args.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
)
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
logger.info(
" Total train batch size (w. parallel, distributed & accumulation) = %d",
args.train_batch_size
* args.gradient_accumulation_steps
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
)
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
model.train()
batch = tuple(t.to(args.device) for t in batch)
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
if args.model_type != "distilbert":
inputs["token_type_ids"] = (
batch[2] if args.model_type in ["bert", "xlnet"] else None
) # XLM, DistilBERT and RoBERTa don't use segment_ids
outputs = model(**inputs)
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
logs = {}
if (
args.local_rank == -1 and args.evaluate_during_training
): # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer)
for key, value in results.items():
eval_key = "eval_{}".format(key)
logs[eval_key] = value
loss_scalar = (tr_loss - logging_loss) / args.logging_steps
learning_rate_scalar = scheduler.get_lr()[0]
logs["learning_rate"] = learning_rate_scalar
logs["loss"] = loss_scalar
logging_loss = tr_loss
for key, value in logs.items():
tb_writer.add_scalar(key, value, global_step)
# print(json.dumps({**logs, **{'step': global_step}}))
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = (
model.module if hasattr(model, "module") else model
) # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, "training_args.bin"))
logger.info("Saving model checkpoint to %s", output_dir)
if args.max_steps > 0 and global_step > args.max_steps:
epoch_iterator.close()
break
if args.max_steps > 0 and global_step > args.max_steps:
train_iterator.close()
break
if args.local_rank in [-1, 0]:
tb_writer.close()
return global_step, tr_loss / global_step
def evaluate(args, model, tokenizer, prefix=""):
# Loop to handle MNLI double evaluation (matched, mis-matched)
eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
results = {}
for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
eval_dataset, label_list = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
os.makedirs(eval_output_dir)
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# multi-gpu eval
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = %d", len(eval_dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
for batch in tqdm(eval_dataloader, desc="Evaluating"):
model.eval()
batch = tuple(t.to(args.device) for t in batch)
with torch.no_grad():
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
if args.model_type != "distilbert":
inputs["token_type_ids"] = (
batch[2] if args.model_type in ["bert", "xlnet"] else None
) # XLM, DistilBERT and RoBERTa don't use segment_ids
outputs = model(**inputs)
tmp_eval_loss, logits = outputs[:2]
eval_loss += tmp_eval_loss.mean().item()
nb_eval_steps += 1
if preds is None:
preds = logits.detach().cpu().numpy()
out_label_ids = inputs["labels"].detach().cpu().numpy()
pair_ids = batch[4].detach().cpu().numpy()
else:
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
pair_ids = np.append(pair_ids, batch[4].detach().cpu().numpy(), axis=0)
eval_loss = eval_loss / nb_eval_steps
if args.output_mode == "classification":
preds = np.argmax(preds, axis=1)
elif args.output_mode == "regression":
preds = np.squeeze(preds)
output_eval_file = os.path.join(eval_output_dir, "hans_predictions.txt")
with open(output_eval_file, "w") as writer:
writer.write("pairID,gld_label\n")
for pid, pred in zip(pair_ids, preds):
writer.write("ex" + str(pid) + "," + label_list[int(pred)] + "\n")
return results
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
if args.local_rank not in [-1, 0] and not evaluate:
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
processor = processors[task]()
output_mode = output_modes[task]
# Load data features from cache or dataset file
cached_features_file = os.path.join(
args.data_dir,
"cached_{}_{}_{}_{}".format(
"dev" if evaluate else "train",
list(filter(None, args.model_name_or_path.split("/"))).pop(),
str(args.max_seq_length),
str(task),
),
)
label_list = processor.get_labels()
if os.path.exists(cached_features_file) and not args.overwrite_cache:
logger.info("Loading features from cached file %s", cached_features_file)
features = torch.load(cached_features_file)
else:
logger.info("Creating features from dataset file at %s", args.data_dir)
if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]:
# HACK(label indices are swapped in RoBERTa pretrained model)
label_list[1], label_list[2] = label_list[2], label_list[1]
examples = (
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
)
features = convert_examples_to_features(
examples,
tokenizer,
label_list=label_list,
max_length=args.max_seq_length,
output_mode=output_mode,
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)
if args.local_rank == 0 and not evaluate:
torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset; the others will use the cache
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
if output_mode == "classification":
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
elif output_mode == "regression":
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
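# HANS keeps the original pairID of every example so that evaluate() can write one prediction per pairID.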
all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids)
return dataset, label_list
def main():
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
)
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
)
parser.add_argument(
"--task_name",
default=None,
type=str,
required=True,
help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
)
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written.",
)
# Other parameters
parser.add_argument(
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
)
parser.add_argument(
"--tokenizer_name",
default="",
type=str,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from s3",
)
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.",
)
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
)
parser.add_argument(
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
)
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument(
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
)
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
parser.add_argument(
"--eval_all_checkpoints",
action="store_true",
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument(
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
)
parser.add_argument(
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
"--fp16",
action="store_true",
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
parser.add_argument(
"--fp16_opt_level",
type=str,
default="O1",
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
args = parser.parse_args()
if (
os.path.exists(args.output_dir)
and os.listdir(args.output_dir)
and args.do_train
and not args.overwrite_output_dir
):
raise ValueError(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
args.output_dir
)
)
# Setup distant debugging if needed
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")
args.n_gpu = 1
args.device = device
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
args.local_rank,
device,
args.n_gpu,
bool(args.local_rank != -1),
args.fp16,
)
# Set seed
set_seed(args)
# Prepare GLUE task
args.task_name = args.task_name.lower()
if args.task_name not in processors:
raise ValueError("Task not found: %s" % (args.task_name))
processor = processors[args.task_name]()
args.output_mode = output_modes[args.task_name]
label_list = processor.get_labels()
num_labels = len(label_list)
# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(
args.config_name if args.config_name else args.model_name_or_path,
num_labels=num_labels,
finetuning_task=args.task_name,
cache_dir=args.cache_dir if args.cache_dir else None,
)
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None,
)
model = model_class.from_pretrained(
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
cache_dir=args.cache_dir if args.cache_dir else None,
)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
model.to(args.device)
logger.info("Training/evaluation parameters %s", args)
# Training
if args.do_train:
train_dataset, _ = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
# Create output directory if needed
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
logger.info("Saving model checkpoint to %s", args.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = (
model.module if hasattr(model, "module") else model
) # Take care of distributed/parallel training
model_to_save.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
# Load a trained model and vocabulary that you have fine-tuned
model = model_class.from_pretrained(args.output_dir)
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
model.to(args.device)
# Evaluation
results = {}
if args.do_eval and args.local_rank in [-1, 0]:
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
checkpoints = [args.output_dir]
if args.eval_all_checkpoints:
checkpoints = list(
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
)
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
logger.info("Evaluate the following checkpoints: %s", checkpoints)
for checkpoint in checkpoints:
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
model = model_class.from_pretrained(checkpoint)
model.to(args.device)
result = evaluate(args, model, tokenizer, prefix=prefix)
result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
results.update(result)
return results
if __name__ == "__main__":
main()
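The script above ends by writing `hans_predictions.txt` as a two-column `pairID,gold_label` CSV. A minimal sketch (not part of the diff; it assumes the file sits in the working directory) of reloading that output and tallying the predicted labels:

```python
# Minimal sketch: reload the hans_predictions.txt written by evaluate() above
# and count how often each label was predicted.
import csv
from collections import Counter

label_counts = Counter()
with open("hans_predictions.txt") as f:
    reader = csv.reader(f)
    next(reader)  # skip the "pairID,gold_label" header
    for pair_id, predicted_label in reader:
        label_counts[predicted_label] += 1

print(label_counts)
```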

examples/hans/utils_hans.py Normal file

@@ -0,0 +1,121 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import csv
import json
class InputExample(object):
"""
A single training/test example for simple sequence classification.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
def __init__(self, guid, text_a, text_b=None, label=None, pairID=None):
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
self.pairID = pairID
def __repr__(self):
return str(self.to_json_string())
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
class InputFeatures(object):
"""
A single set of features of data.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
token_type_ids: Segment token indices to indicate first and second portions of the inputs.
label: Label corresponding to the input
"""
def __init__(self, input_ids, attention_mask, token_type_ids, label, pairID=None):
self.input_ids = input_ids
self.attention_mask = attention_mask
self.token_type_ids = token_type_ids
self.label = label
self.pairID = pairID
def __repr__(self):
return str(self.to_json_string())
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def get_example_from_tensor_dict(self, tensor_dict):
"""Gets an example from a dict with tensorflow tensors
Args:
tensor_dict: Keys and values should match the corresponding Glue
tensorflow_dataset examples.
"""
raise NotImplementedError()
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding="utf-8-sig") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
lines.append(line)
return lines
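The HANS-specific `InputExample` above carries a `pairID` in addition to the usual GLUE-style fields. A minimal sketch (not part of the diff; the sentences and label are purely illustrative, and `utils_hans` is assumed to be importable from this directory):

```python
# Minimal sketch: building a HANS-style example and serializing it to JSON.
from utils_hans import InputExample

example = InputExample(
    guid="dev-1",
    text_a="The doctor visited the lawyer.",
    text_b="The lawyer visited the doctor.",
    label="non-entailment",
    pairID="1",
)
print(example)  # pretty-printed JSON via to_json_string()
```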


@@ -0,0 +1,614 @@
# coding=utf-8
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for multimodal multiclass prediction on MM-IMDB dataset."""
import argparse
import glob
import json
import logging
import os
import random
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (
WEIGHTS_NAME,
AdamW,
AlbertConfig,
AlbertModel,
AlbertTokenizer,
BertConfig,
BertModel,
BertTokenizer,
DistilBertConfig,
DistilBertModel,
DistilBertTokenizer,
MMBTConfig,
MMBTForClassification,
RobertaConfig,
RobertaModel,
RobertaTokenizer,
XLMConfig,
XLMModel,
XLMTokenizer,
XLNetConfig,
XLNetModel,
XLNetTokenizer,
get_linear_schedule_with_warmup,
)
from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
from tensorboardX import SummaryWriter
logger = logging.getLogger(__name__)
ALL_MODELS = sum(
(
tuple(conf.pretrained_config_archive_map.keys())
for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
),
(),
)
MODEL_CLASSES = {
"bert": (BertConfig, BertModel, BertTokenizer),
"xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
"xlm": (XLMConfig, XLMModel, XLMTokenizer),
"roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
"distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
"albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
}
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
def train(args, train_dataset, model, tokenizer, criterion):
""" Train the model """
if args.local_rank in [-1, 0]:
tb_writer = SummaryWriter()
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(
train_dataset,
sampler=train_sampler,
batch_size=args.train_batch_size,
collate_fn=collate_fn,
num_workers=args.num_workers,
)
if args.max_steps > 0:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": args.weight_decay,
},
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
if args.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
)
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
logger.info(
" Total train batch size (w. parallel, distributed & accumulation) = %d",
args.train_batch_size
* args.gradient_accumulation_steps
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
)
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
tr_loss, logging_loss = 0.0, 0.0
best_f1, n_no_improve = 0, 0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproducibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
model.train()
batch = tuple(t.to(args.device) for t in batch)
labels = batch[5]
inputs = {
"input_ids": batch[0],
"input_modal": batch[2],
"attention_mask": batch[1],
"modal_start_tokens": batch[3],
"modal_end_tokens": batch[4],
}
outputs = model(**inputs)
logits = outputs[0] # model outputs are always tuple in transformers (see doc)
loss = criterion(logits, labels)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
logs = {}
if (
args.local_rank == -1 and args.evaluate_during_training
): # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer, criterion)
for key, value in results.items():
eval_key = "eval_{}".format(key)
logs[eval_key] = value
loss_scalar = (tr_loss - logging_loss) / args.logging_steps
learning_rate_scalar = scheduler.get_lr()[0]
logs["learning_rate"] = learning_rate_scalar
logs["loss"] = loss_scalar
logging_loss = tr_loss
for key, value in logs.items():
tb_writer.add_scalar(key, value, global_step)
print(json.dumps({**logs, **{"step": global_step}}))
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = (
model.module if hasattr(model, "module") else model
) # Take care of distributed/parallel training
torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
torch.save(args, os.path.join(output_dir, "training_args.bin"))
logger.info("Saving model checkpoint to %s", output_dir)
if args.max_steps > 0 and global_step > args.max_steps:
epoch_iterator.close()
break
if args.max_steps > 0 and global_step > args.max_steps:
train_iterator.close()
break
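# Simple early stopping on dev micro-F1: evaluate at the end of each epoch and stop once
# there has been no improvement for more than args.patience epochs.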
if args.local_rank == -1:
results = evaluate(args, model, tokenizer, criterion)
if results["micro_f1"] > best_f1:
best_f1 = results["micro_f1"]
n_no_improve = 0
else:
n_no_improve += 1
if n_no_improve > args.patience:
train_iterator.close()
break
if args.local_rank in [-1, 0]:
tb_writer.close()
return global_step, tr_loss / global_step
def evaluate(args, model, tokenizer, criterion, prefix=""):
# Loop to handle MNLI double evaluation (matched, mis-matched)
eval_output_dir = args.output_dir
eval_dataset = load_examples(args, tokenizer, evaluate=True)
if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
os.makedirs(eval_output_dir)
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(
eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn
)
# multi-gpu eval
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = %d", len(eval_dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
for batch in tqdm(eval_dataloader, desc="Evaluating"):
model.eval()
batch = tuple(t.to(args.device) for t in batch)
with torch.no_grad():
labels = batch[5]
inputs = {
"input_ids": batch[0],
"input_modal": batch[2],
"attention_mask": batch[1],
"modal_start_tokens": batch[3],
"modal_end_tokens": batch[4],
}
outputs = model(**inputs)
logits = outputs[0] # model outputs are always tuple in transformers (see doc)
tmp_eval_loss = criterion(logits, labels)
eval_loss += tmp_eval_loss.mean().item()
nb_eval_steps += 1
if preds is None:
preds = torch.sigmoid(logits).detach().cpu().numpy() > 0.5
out_label_ids = labels.detach().cpu().numpy()
else:
preds = np.append(preds, torch.sigmoid(logits).detach().cpu().numpy() > 0.5, axis=0)
out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)
eval_loss = eval_loss / nb_eval_steps
result = {
"loss": eval_loss,
"macro_f1": f1_score(out_label_ids, preds, average="macro"),
"micro_f1": f1_score(out_label_ids, preds, average="micro"),
}
output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results {} *****".format(prefix))
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
return result
def load_examples(args, tokenizer, evaluate=False):
path = os.path.join(args.data_dir, "dev.jsonl" if evaluate else "train.jsonl")
transforms = get_image_transforms()
labels = get_mmimdb_labels()
dataset = JsonlDataset(path, tokenizer, transforms, labels, args.max_seq_length - args.num_image_embeds - 2)
return dataset
def main():
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .jsonl files for MMIMDB.",
)
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
)
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written.",
)
# Other parameters
parser.add_argument(
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
)
parser.add_argument(
"--tokenizer_name",
default="",
type=str,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from s3",
)
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.",
)
parser.add_argument(
"--num_image_embeds", default=1, type=int, help="Number of Image Embeddings from the Image Encoder"
)
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
)
parser.add_argument(
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
)
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument(
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
)
parser.add_argument("--patience", default=5, type=int, help="Patience for Early Stopping.")
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
parser.add_argument(
"--eval_all_checkpoints",
action="store_true",
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument("--num_workers", type=int, default=8, help="number of worker threads for dataloading")
parser.add_argument(
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
)
parser.add_argument(
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
"--fp16",
action="store_true",
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
parser.add_argument(
"--fp16_opt_level",
type=str,
default="O1",
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
args = parser.parse_args()
if (
os.path.exists(args.output_dir)
and os.listdir(args.output_dir)
and args.do_train
and not args.overwrite_output_dir
):
raise ValueError(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
args.output_dir
)
)
# Setup distant debugging if needed
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")
args.n_gpu = 1
args.device = device
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
args.local_rank,
device,
args.n_gpu,
bool(args.local_rank != -1),
args.fp16,
)
# Set seed
set_seed(args)
# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
# Setup model
labels = get_mmimdb_labels()
num_labels = len(labels)
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
transformer_config = config_class.from_pretrained(
args.config_name if args.config_name else args.model_name_or_path
)
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None,
)
transformer = model_class.from_pretrained(
args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None
)
img_encoder = ImageEncoder(args)
config = MMBTConfig(transformer_config, num_labels=num_labels)
model = MMBTForClassification(config, transformer, img_encoder)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
model.to(args.device)
logger.info("Training/evaluation parameters %s", args)
# Training
if args.do_train:
train_dataset = load_examples(args, tokenizer, evaluate=False)
label_frequencies = train_dataset.get_label_frequencies()
label_frequencies = [label_frequencies[l] for l in labels]
label_weights = (
torch.tensor(label_frequencies, device=args.device, dtype=torch.float) / len(train_dataset)
) ** -1
criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights)
global_step, tr_loss = train(args, train_dataset, model, tokenizer, criterion)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
# Create output directory if needed
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
logger.info("Saving model checkpoint to %s", args.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = (
model.module if hasattr(model, "module") else model
) # Take care of distributed/parallel training
torch.save(model_to_save.state_dict(), os.path.join(args.output_dir, WEIGHTS_NAME))
tokenizer.save_pretrained(args.output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
# Load a trained model and vocabulary that you have fine-tuned
model = MMBTForClassification(config, transformer, img_encoder)
model.load_state_dict(torch.load(os.path.join(args.output_dir, WEIGHTS_NAME)))
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
model.to(args.device)
# Evaluation
results = {}
if args.do_eval and args.local_rank in [-1, 0]:
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
checkpoints = [args.output_dir]
if args.eval_all_checkpoints:
checkpoints = list(
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
)
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
logger.info("Evaluate the following checkpoints: %s", checkpoints)
for checkpoint in checkpoints:
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
model = MMBTForClassification(config, transformer, img_encoder)
model.load_state_dict(torch.load(checkpoint))
model.to(args.device)
result = evaluate(args, model, tokenizer, criterion, prefix=prefix)
result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
results.update(result)
return results
if __name__ == "__main__":
main()
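The training entry point above weights the BCE loss by inverse label frequency before calling `train()`. A small standalone sketch of that `pos_weight` computation, with made-up frequencies (the numbers are illustrative, not from the diff):

```python
# Minimal sketch: rarer labels get larger positive-class weights in
# BCEWithLogitsLoss, mirroring the label_frequencies / pos_weight logic above.
import torch
import torch.nn as nn

dataset_size = 100
label_frequencies = torch.tensor([80.0, 20.0, 5.0])  # hypothetical counts per label

pos_weight = (label_frequencies / dataset_size) ** -1  # tensor([ 1.25,  5.00, 20.00])
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

logits = torch.zeros(4, 3)                       # dummy batch of 4 examples, 3 labels
targets = torch.randint(0, 2, (4, 3)).float()
loss = criterion(logits, targets)
```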


@@ -0,0 +1,143 @@
# coding=utf-8
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from collections import Counter
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset
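# Maps --num_image_embeds to an (rows, cols) grid for adaptive average pooling,
# e.g. 4 image embeddings come from a 2x2 pooled ResNet feature map.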
POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (3, 2), 7: (7, 1), 8: (4, 2), 9: (3, 3)}
class ImageEncoder(nn.Module):
def __init__(self, args):
super().__init__()
model = torchvision.models.resnet152(pretrained=True)
modules = list(model.children())[:-2]
self.model = nn.Sequential(*modules)
self.pool = nn.AdaptiveAvgPool2d(POOLING_BREAKDOWN[args.num_image_embeds])
def forward(self, x):
# Bx3x224x224 -> Bx2048x7x7 -> Bx2048xN -> BxNx2048
out = self.pool(self.model(x))
out = torch.flatten(out, start_dim=2)
out = out.transpose(1, 2).contiguous()
return out # BxNx2048
class JsonlDataset(Dataset):
def __init__(self, data_path, tokenizer, transforms, labels, max_seq_length):
self.data = [json.loads(l) for l in open(data_path)]
self.data_dir = os.path.dirname(data_path)
self.tokenizer = tokenizer
self.labels = labels
self.n_classes = len(labels)
self.max_seq_length = max_seq_length
self.transforms = transforms
def __len__(self):
return len(self.data)
def __getitem__(self, index):
sentence = torch.LongTensor(self.tokenizer.encode(self.data[index]["text"], add_special_tokens=True))
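# Split off the special start/end tokens; MMBT uses them as modal_start_tokens /
# modal_end_tokens around the image embeddings (see collate_fn below).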
start_token, sentence, end_token = sentence[0], sentence[1:-1], sentence[-1]
sentence = sentence[: self.max_seq_length]
label = torch.zeros(self.n_classes)
label[[self.labels.index(tgt) for tgt in self.data[index]["label"]]] = 1
image = Image.open(os.path.join(self.data_dir, self.data[index]["img"])).convert("RGB")
image = self.transforms(image)
return {
"image_start_token": start_token,
"image_end_token": end_token,
"sentence": sentence,
"image": image,
"label": label,
}
def get_label_frequencies(self):
label_freqs = Counter()
for row in self.data:
label_freqs.update(row["label"])
return label_freqs
def collate_fn(batch):
lens = [len(row["sentence"]) for row in batch]
bsz, max_seq_len = len(batch), max(lens)
mask_tensor = torch.zeros(bsz, max_seq_len, dtype=torch.long)
text_tensor = torch.zeros(bsz, max_seq_len, dtype=torch.long)
for i_batch, (input_row, length) in enumerate(zip(batch, lens)):
text_tensor[i_batch, :length] = input_row["sentence"]
mask_tensor[i_batch, :length] = 1
img_tensor = torch.stack([row["image"] for row in batch])
tgt_tensor = torch.stack([row["label"] for row in batch])
img_start_token = torch.stack([row["image_start_token"] for row in batch])
img_end_token = torch.stack([row["image_end_token"] for row in batch])
return text_tensor, mask_tensor, img_tensor, img_start_token, img_end_token, tgt_tensor
def get_mmimdb_labels():
return [
"Crime",
"Drama",
"Thriller",
"Action",
"Comedy",
"Romance",
"Documentary",
"Short",
"Mystery",
"History",
"Family",
"Adventure",
"Fantasy",
"Sci-Fi",
"Western",
"Horror",
"Sport",
"War",
"Music",
"Musical",
"Animation",
"Biography",
"Film-Noir",
]
def get_image_transforms():
return transforms.Compose(
[
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
]
)
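Taken together, the helpers above plug into a standard `DataLoader`. A minimal sketch of the wiring (the `dev.jsonl` path and the BERT tokenizer are placeholder choices, not taken from the diff):

```python
# Minimal sketch: wiring JsonlDataset and collate_fn into a DataLoader.
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from utils_mmimdb import JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = JsonlDataset(
    "dev.jsonl",                 # one JSON object per line with "text", "img", "label"
    tokenizer,
    get_image_transforms(),
    get_mmimdb_labels(),
    128 - 1 - 2,                 # max_seq_length minus num_image_embeds and special tokens
)
loader = DataLoader(dataset, batch_size=8, collate_fn=collate_fn)
text, mask, image, img_start, img_end, labels = next(iter(loader))
```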

examples/pplm/README.md Normal file

@@ -0,0 +1,54 @@
# Plug and Play Language Models: a Simple Approach to Controlled Text Generation
Authors: [Sumanth Dathathri](https://dathath.github.io/), [Andrea Madotto](https://andreamad8.github.io/), Janice Lan, Jane Hung, Eric Frank, [Piero Molino](https://w4nderlu.st/), [Jason Yosinski](http://yosinski.com/), and [Rosanne Liu](http://www.rosanneliu.com/)
This folder contains the original code used to run the Plug and Play Language Model (PPLM).
Paper link: https://arxiv.org/abs/1912.02164
Blog link: https://eng.uber.com/pplm
Please check out the repo under uber-research for more information: https://github.com/uber-research/PPLM
## Setup
```bash
git clone https://github.com/huggingface/transformers && cd transformers
pip install .
pip install nltk torchtext # additional requirements.
cd examples/pplm
```
## PPLM-BoW
### Example command for bag-of-words control
```bash
python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.03 --window_length 5 --kl_scale 0.01 --gm_scale 0.99 --colorama --sample
```
### Tuning hyperparameters for bag-of-words control
1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model.
2. If the language being generated is repetitive (e.g. "science science experiment experiment"), there are several options to consider: <br/>
a) Reduce the `--stepsize` <br/>
b) Increase `--kl_scale` (the KL-loss coefficient) or decrease `--gm_scale` (the gm-scaling term) <br/>
c) Add `--grad-length xx`, where xx is an integer <= length (e.g. `--grad-length 30`). <br/>
## PPLM-Discrim
### Example command for discriminator based sentiment control
```bash
python run_pplm.py -D sentiment --class_label 2 --cond_text "My dog died" --length 50 --gamma 1.0 --num_iterations 10 --num_samples 10 --stepsize 0.04 --kl_scale 0.01 --gm_scale 0.95 --sample
```
### Tuning hyperparameters for discriminator control
1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model.
2. Use `--class_label 3` for negative and `--class_label 2` for positive sentiment.

Two binary image files added (contents not shown in this diff): 653 KiB and 664 KiB.


@@ -0,0 +1,19 @@
import torch
class ClassificationHead(torch.nn.Module):
"""Classification Head for transformer encoders"""
def __init__(self, class_size, embed_size):
super().__init__()
self.class_size = class_size
self.embed_size = embed_size
# self.mlp1 = torch.nn.Linear(embed_size, embed_size)
# self.mlp2 = (torch.nn.Linear(embed_size, class_size))
self.mlp = torch.nn.Linear(embed_size, class_size)
def forward(self, hidden_state):
# hidden_state = F.relu(self.mlp1(hidden_state))
# hidden_state = self.mlp2(hidden_state)
logits = self.mlp(hidden_state)
return logits
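A minimal usage sketch for the head above (not part of the diff; the sizes are illustrative and the `ClassificationHead` class defined above is assumed to be in scope):

```python
# Minimal sketch: scoring a batch of pooled hidden states with ClassificationHead.
import torch

head = ClassificationHead(class_size=5, embed_size=1024)  # e.g. GPT-2 medium hidden size
hidden = torch.randn(8, 1024)   # one pooled hidden state per example
logits = head(hidden)           # shape (8, 5), unnormalized class scores
```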

Some files were not shown because too many files have changed in this diff.