Push conda-build on branch

Fix conda-build
Release: v4.3.1
2025-10-20 17:13:56 +08:00 · 2021-02-26 20:19:22 -05:00 · 2021-02-26 20:14:39 -05:00 · 2021-02-09 09:55:55 +01:00 · 2021-02-09 09:55:55 +01:00 · 2021-02-08 18:31:49 +01:00
1618 changed files with 112454 additions and 65073 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -3,6 +3,7 @@ orbs:
    gcp-gke: circleci/gcp-gke@1.0.4
    go: circleci/go@1.3.0

+
 # TPU REFERENCES
 references:
    checkout_ml_testing: &checkout_ml_testing
@ -77,7 +78,8 @@ jobs:
                      - v0.4-torch_and_tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,torch,testing]
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
@ -103,7 +105,8 @@ jobs:
                      - v0.4-torch-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing]
+            - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
@ -129,7 +132,7 @@ jobs:
                      - v0.4-tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,testing]
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
            - save_cache:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
@ -155,7 +158,7 @@ jobs:
                    - v0.4-flax-{{ checksum "setup.py" }}
                    - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: sudo pip install .[flax,sklearn,torch,testing]
+            - run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece]
            - save_cache:
                  key: v0.4-flax-{{ checksum "setup.py" }}
                  paths:
@ -181,7 +184,8 @@ jobs:
                      - v0.4-torch-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing]
+            - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
@ -207,7 +211,7 @@ jobs:
                      - v0.4-tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,testing]
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
            - save_cache:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
@ -221,7 +225,7 @@ jobs:
    run_tests_custom_tokenizers:
        working_directory: ~/transformers
        docker:
-            - image: circleci/python:3.6
+            - image: circleci/python:3.7
        environment:
            RUN_CUSTOM_TOKENIZERS: yes
        steps:
@ -231,7 +235,7 @@ jobs:
                      - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[ja,testing]
+            - run: pip install .[ja,testing,sentencepiece]
            - run: python -m unidic download
            - save_cache:
                  key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
@ -258,8 +262,8 @@ jobs:
                      - v0.4-torch_examples-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing]
-            - run: pip install -r examples/requirements.txt
+            - run: pip install .[sklearn,torch,sentencepiece,testing]
+            - run: pip install -r examples/_tests_requirements.txt
            - save_cache:
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
                  paths:
@ -270,6 +274,22 @@ jobs:
            - store_artifacts:
                  path: ~/transformers/reports

+    run_tests_git_lfs:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo apt-get install git-lfs
+            - run: |
+                git config --global user.email "ci@dummy.com"
+                git config --global user.name "ci"
+            - run: pip install --upgrade pip
+            - run: pip install .[testing]
+            - run: RUN_GIT_LFS_TESTS=1 python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"
+
    build_doc:
        working_directory: ~/transformers
        docker:
@ -324,7 +344,7 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
            - run: pip install isort
-            - run: pip install .[tf,torch,flax,quality]
+            - run: pip install .[all,quality]
            - save_cache:
                  key: v0.4-code_quality-{{ checksum "setup.py" }}
                  paths:
@ -334,6 +354,7 @@ jobs:
            - run: flake8 examples tests src utils
            - run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
            - run: python utils/check_copies.py
+            - run: python utils/check_table.py
            - run: python utils/check_dummies.py
            - run: python utils/check_repo.py

@ -397,17 +418,18 @@ workflows:
            - run_tests_flax
            - run_tests_pipelines_torch
            - run_tests_pipelines_tf
+            - run_tests_git_lfs
            - build_doc
            - deploy_doc: *workflow_filters
-    tpu_testing_jobs:
-        triggers:
-            - schedule:
-                # Set to run at the first minute of every hour.
-                cron: "0 8 * * *"
-                filters:
-                    branches:
-                        only:
-                            - master
-        jobs:
-            - cleanup-gke-jobs
-            - run_examples_tpu
+#    tpu_testing_jobs:
+#        triggers:
+#            - schedule:
+#                # Set to run at the first minute of every hour.
+#                cron: "0 8 * * *"
+#                filters:
+#                    branches:
+#                        only:
+#                            - master
+#        jobs:
+#            - cleanup-gke-jobs
+#            - run_examples_tpu
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@ -52,4 +52,7 @@ deploy_doc "4b3ee9c" v3.1.0
 deploy_doc "3ebb1b3" v3.2.0
 deploy_doc "0613f05" v3.3.1
 deploy_doc "eb0e0ce" v3.4.0
-deploy_doc "818878d" # v3.5.1 Latest stable release
+deploy_doc "818878d" v3.5.1
+deploy_doc "c781171" v4.0.0
+deploy_doc "bfa4ccf" v4.1.1
+deploy_doc "7d9a9d0" # v4.2.0 Latest stable release
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@ -11,7 +11,7 @@ assignees: ''
 ## Environment info
 <!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
     Don't forget to fill out the missing fields in that output! -->
-     
+
 - `transformers` version:
 - Platform:
 - Python version:
@ -24,32 +24,41 @@ assignees: ''
 <!-- Your issue will be replied to more quickly if you can figure out the right person to tag with @
 If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
 Please tag fewer than 3 people.
- 
- albert, bert, GPT2, XLM: @LysandreJik 
- tokenizers: @mfuntowicz
- Trainer: @sgugger
- Speed and Memory Benchmarks: @patrickvonplaten
- Model Cards: @julien-c
- TextGeneration: @TevenLeScao 
- examples/distillation: @VictorSanh
- nlp datasets: [different repo](https://github.com/huggingface/nlp)
- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
- Text Generation: @patrickvonplaten @TevenLeScao
- Blenderbot: @patrickvonplaten
- Bart: @patrickvonplaten
- Marian: @patrickvonplaten
- Pegasus: @patrickvonplaten
- mBART: @patrickvonplaten
- T5: @patrickvonplaten
- Longformer/Reformer: @patrickvonplaten
- TransfoXL/XLNet: @TevenLeScao
- RAG: @patrickvonplaten, @lhoestq
- FSMT: @stas00
- examples/seq2seq: @patil-suraj
- examples/bert-loses-patience: @JetRunner
- tensorflow: @jplu
- examples/token-classification: @stefan-it
- documentation: @sgugger
+
+Models:
+
+- albert, bert, xlm: @LysandreJik
+- blenderbot, bart, marian, pegasus, encoderdecoder,  t5: @patrickvonplaten, @patil-suraj
+- longformer, reformer, transfoxl, xlnet: @patrickvonplaten
+- fsmt: @stas00
+- funnel: @sgugger
+- gpt2: @patrickvonplaten, @LysandreJik
+- rag: @patrickvonplaten, @lhoestq
+- tensorflow: @jplu
+
+Library:
+
+- benchmarks: @patrickvonplaten
+- deepspeed: @stas00
+- ray/raytune: @richardliaw, @amogkam
+- text generation: @patrickvonplaten
+- tokenizers: @n1t0, @LysandreJik
+- trainer: @sgugger
+- pipelines: @LysandreJik
+
+Documentation: @sgugger
+
+HF projects:
+
+- nlp datasets: [different repo](https://github.com/huggingface/nlp)
+- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
+
+Examples:
+
+- maintained examples (not research project or legacy): @sgugger, @patil-suraj
+- research_projects/bert-loses-patience: @JetRunner
+- research_projects/distillation: @VictorSanh
+
 -->

 ## Information
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -37,26 +37,38 @@ members/contributors which may be interested in your PR.
 If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
 Please tag fewer than 3 people.

- albert, bert, XLM: @LysandreJik
- GPT2: @LysandreJik, @patrickvonplaten
- tokenizers: @mfuntowicz
- Trainer: @sgugger
- Benchmarks: @patrickvonplaten
- Model Cards: @julien-c
- examples/distillation: @VictorSanh
- nlp datasets: [different repo](https://github.com/huggingface/nlp)
- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
- Text Generation: @patrickvonplaten, @TevenLeScao
- Blenderbot, Bart, Marian, Pegasus: @patrickvonplaten
- T5: @patrickvonplaten
- Rag: @patrickvonplaten, @lhoestq
- EncoderDecoder: @patrickvonplaten
- Longformer, Reformer: @patrickvonplaten
- TransfoXL, XLNet: @TevenLeScao, @patrickvonplaten
- examples/seq2seq: @patil-suraj
- examples/bert-loses-patience: @JetRunner
- tensorflow: @jplu
- examples/token-classification: @stefan-it
- documentation: @sgugger
- FSTM: @stas00
+Models:
+
+- albert, bert, xlm: @LysandreJik
+- blenderbot, bart, marian, pegasus, encoderdecoder,  t5: @patrickvonplaten, @patil-suraj
+- longformer, reformer, transfoxl, xlnet: @patrickvonplaten
+- fsmt: @stas00
+- funnel: @sgugger
+- gpt2: @patrickvonplaten, @LysandreJik
+- rag: @patrickvonplaten, @lhoestq
+- tensorflow: @jplu
+
+Library:
+
+- benchmarks: @patrickvonplaten
+- deepspeed: @stas00
+- ray/raytune: @richardliaw, @amogkam
+- text generation: @patrickvonplaten
+- tokenizers: @n1t0, @LysandreJik
+- trainer: @sgugger
+- pipelines: @LysandreJik
+
+Documentation: @sgugger
+
+HF projects:
+
+- nlp datasets: [different repo](https://github.com/huggingface/nlp)
+- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
+
+Examples:
+
+- maintained examples (not research project or legacy): @sgugger, @patil-suraj
+- research_projects/bert-loses-patience: @JetRunner
+- research_projects/distillation: @VictorSanh
+
 -->
--- a/.github/conda/build.sh
+++ b/.github/conda/build.sh
@ -0,0 +1 @@
+$PYTHON setup.py install     # Python command to install the script.
--- a/.github/conda/meta.yaml
+++ b/.github/conda/meta.yaml
@ -0,0 +1,48 @@
+{% set name = "transformers" %}
+
+package:
+  name: "{{ name|lower }}"
+  version: "{{ TRANSFORMERS_VERSION }}"
+
+source:
+  path: ../../
+
+build:
+  noarch: python
+
+requirements:
+  host:
+    - python
+    - pip
+    - numpy >=1.17
+    - dataclasses
+    - packaging
+    - filelock
+    - requests
+    - tqdm >=4.27
+    - sacremoses
+    - regex !=2019.12.17
+    - protobuf
+    - tokenizers >=0.10.1,<0.11.0
+  run:
+    - python
+    - numpy >=1.17
+    - dataclasses
+    - packaging
+    - filelock
+    - requests
+    - tqdm >=4.27
+    - sacremoses
+    - regex !=2019.12.17
+    - protobuf
+    - tokenizers >=0.10.1,<0.11.0
+
+test:
+  imports:
+    - transformers
+
+about:
+  home: https://huggingface.co
+  license: Apache License 2.0
+  license_file: LICENSE
+  summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0."
--- a/.github/stale.yml
+++ b/.github/stale.yml
@ -6,6 +6,7 @@ daysUntilClose: 7
 exemptLabels:
  - pinned
  - security
+  - Feature request
 # Label to use when marking an issue as stale
 staleLabel: wontfix
 # Comment to post when marking an issue as stale. Set to `false` to disable
--- a/.github/workflows/github-torch-hub.yml
+++ b/.github/workflows/github-torch-hub.yml
@ -1,6 +1,6 @@
 name: Torch hub integration

-on: 
+on:
  push:
    branches:
      - "*"
@ -32,8 +32,10 @@ jobs:
    - name: Install dependencies
      run: |
        pip install --upgrade pip
-        pip install torch
-        pip install numpy filelock protobuf requests tqdm regex sentencepiece sacremoses tokenizers packaging
+        # install torch-hub specific dependencies
+        pip install -e git+https://github.com/huggingface/transformers.git#egg=transformers[torchhub]
+        # no longer needed
+        pip uninstall -y transformers

    - name: Torch hub list
      run: |
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@ -0,0 +1,70 @@
+name: Model templates runner
+
+on:
+  push:
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+  pull_request_target:
+    branches:
+      - master
+
+jobs:
+  run_tests_templates:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v1
+
+      - name: Install Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.6
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: ~/.cache/pip
+          key: v1.2-tests_templates
+          restore-keys: |
+            v1.2-tests_templates-${{ hashFiles('setup.py') }}
+            v1.2-tests_templates
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[dev]
+      - name: Create model files
+        run: |
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
+          make style
+          python utils/check_table.py --fix_and_overwrite
+          python utils/check_dummies.py --fix_and_overwrite
+
+      - name: Run all non-slow tests
+        run: |
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_templates tests/*template*
+
+      - name: Run style changes
+        run: |
+          git fetch origin master:master
+          make fixup
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_templates_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_templates_test_reports
+          path: reports
--- a/.github/workflows/release-conda.yml
+++ b/.github/workflows/release-conda.yml
@ -0,0 +1,46 @@
+name: Release - Conda
+
+on:
+  push:
+    tags:
+      - v*
+    branches:
+      - v*
+
+env:
+  ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+
+jobs:
+  build_and_package:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v1
+
+      - name: Install miniconda
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          auto-activate-base: false
+          activate-environment: "build-transformers"
+          channels: huggingface
+
+      - name: Setup conda env
+        run: |
+          conda install -c defaults anaconda-client conda-build
+
+      - name: Extract version
+        run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV
+
+      - name: Build conda packages
+        run: |
+          conda info
+          conda list
+          conda-build .github/conda
+
+      - name: Upload to Anaconda
+        run: anaconda upload `conda-build .github/conda --output` --force
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -4,7 +4,7 @@ on:
  push:
    branches:
      - master
-      - model-templates
+      - ci_*
    paths:
      - "src/**"
      - "tests/**"
@ -50,6 +50,7 @@ jobs:
          pip install --upgrade pip
          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
+          pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html

      - name: Are GPUs recognized by our DL frameworks
        run: |
@ -57,13 +58,13 @@ jobs:
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

-      - name: Create model files
-        run: |
-          source .env/bin/activate
-          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
-          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#      - name: Create model files
+#        run: |
+#          source .env/bin/activate
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model

      - name: Run all non-slow tests on GPU
        env:
@ -129,10 +130,10 @@ jobs:
      - name: Create model files
        run: |
          source .env/bin/activate
-          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
-          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model

      - name: Run all non-slow tests on GPU
        env:
@ -187,6 +188,7 @@ jobs:
          pip install --upgrade pip
          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
+          pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html

      - name: Are GPUs recognized by our DL frameworks
        run: |
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -6,10 +6,6 @@
 name: Self-hosted runner (scheduled)

 on:
-  push:
-    branches:
-      - ci_*
-      - framework-agnostic-tokenizers
  repository_dispatch:
  schedule:
    - cron: "0 0 * * *"
@ -79,7 +75,7 @@ jobs:
          RUN_SLOW: yes
        run: |
          source .env/bin/activate
-          pip install -r examples/requirements.txt
+          pip install -r examples/_tests_requirements.txt
          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples

      - name: Failure short reports
--- a/.gitignore
+++ b/.gitignore
@ -159,3 +159,6 @@ tags

 # pre-commit
 .pre-commit*
+
+# .lock
+*.lock
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # How to contribute to transformers?

 Everyone is welcome to contribute, and we value everybody's contribution. Code
@ -125,7 +141,7 @@ Follow these steps to start contributing:
   $ git checkout -b a-descriptive-name-for-my-changes
   ```

-   **do not** work on the `master` branch.
+   **Do not** work on the `master` branch.

 4. Set up a development environment by running the following command in a virtual environment:

@ -312,8 +328,28 @@ for more information.

 ### Develop on Windows

+On windows, you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings:
+
+`git config core.autocrlf input`
+
 One way one can run the make command on Window is to pass by MSYS2:

 1. [Download MSYS2](https://www.msys2.org/), we assume to have it installed in C:\msys64
 2. Open the command line C:\msys64\msys2.exe (it should be available from the start menu)
 3. Run in the shell: `pacman -Syu` and install make with `pacman -S make`
+4. Add `C:\msys64\usr\bin` to your PATH environment variable.
+
+You can now use `make` from any terminal (Powershell, cmd.exe, etc) 🎉
+
+### Syncing forked master with upstream (HuggingFace) master
+
+To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnessary notifications to the developers involved in these PRs, 
+when syncing the master branch of a forked repository, please, follow these steps:
+1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead merge directly into the forked master.
+2. If a PR is absolutely necessary, use the following steps after checking out your branch:
+```
+$ git checkout -b your-branch-for-syncing
+$ git pull --squash --no-commit upstream master
+$ git commit -m '<your message without GitHub references>'
+$ git push --set-upstream origin your-branch-for-syncing
+```
--- a/ISSUES.md
+++ b/ISSUES.md
@ -0,0 +1,275 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# How To Request Support
+
+This is an Open Source Project so please be mindful that like in any other project of this kind there is no obligation to answer all requests for help.
+
+However, we want to encourage you to ask for help whenever you think it's needed! We are happy about every  question we get because it allows us to better understand your needs, possible misunderstandings, and most importantly a way for you to help us make this library better. That being said, this document's main purpose is to provide guidelines at how you can formulate your requests to increase your chances to be understood and to get support.
+
+There are two main venues to receive support: [the forums](https://discuss.huggingface.co/) and [the GitHub issues](https://github.com/huggingface/transformers/issues).
+
+## The Forums
+
+[The user forums](https://discuss.huggingface.co/) are supported by the wide community of the library users and backed up by developers when needed.
+
+If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystalized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).
+
+In particular all "Please explain" questions or objectively very user-specific feature requests belong to the forums. Here are some example of such questions:
+
+* "I would like to use a BertModel within a RL-Agent for a customer support service. How can I use a BertForMaskedLM in my ChatBotModel?"
+
+* "Could you please explain why T5 has no positional embedding matrix under T5Model?"
+
+* "How should I set my generation parameters for translation?"
+
+* "How to train T5 on De->En translation?"
+
+
+## The GitHub Issues
+
+Everything which hints at a bug should be opened as an [issue](https://github.com/huggingface/transformers/issues).
+
+You are not required to read the following guidelines before opening an issue. However, if you notice that your issue doesn't get any replies, chances are that the developers have one or several difficulties with its quality. In this case, reading the following points and adjusting your issue accordingly could help.
+
+1. Before posting an issue, first search for already posted issues, since chances are someone has already asked a similar question before you.
+
+    If you use Google your search query should be:
+
+    ```
+    "huggingface" "transformers" your query
+    ```
+
+    The first two quoted words tell Google to limit the search to the context of the Huggingface Transformers. The remainder is your query - most commonly this would be the error message the software fails with. We will go deeper into details shortly.
+
+    The results of such a query will typically match GitHub issues, Hugging Face forums, StackExchange, and blogs.
+
+    If you find relevant hints, you may choose to continue the discussion there if you have follow up questions.
+
+    If what you found is similar but doesn't quite answer your problem, please, post a new issue and do include links to similar issues or forum discussions you may have found.
+
+    Let's look at some examples:
+
+    The error message, often referred to as an assertion, tells us what went wrong. Here is an example of an assertion:
+
+   ```python
+   Traceback (most recent call last):
+     File "<string>", line 1, in <module>
+     File "/transformers/src/transformers/__init__.py", line 34, in <module>
+       from . import dependency_versions_check
+     File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
+       from .file_utils import is_tokenizers_available
+     File "/transformers/src/transformers/file_utils.py", line 40, in <module>
+       from tqdm.auto import tqdm
+    ModuleNotFoundError: No module named 'tqdm.auto'
+    ```
+
+   and it typically includes a traceback, so that we can see the full stack of calls the program made before it fails. This gives us the context to know why the program failed.
+
+   Going back to the above example. If you received this error search, look at the very last line of the error which is:
+
+   ```python
+    ModuleNotFoundError: No module named 'tqdm.auto'
+    ```
+
+    And now we can use it to do the searching on your favorite search engine:
+
+    1. first for `"huggingface" "transformers" "ModuleNotFoundError: No module named 'tqdm.auto'"`
+    2. if you don't find relevant results, then search for just `"ModuleNotFoundError: No module named 'tqdm.auto'"`
+    3. and finally if nothing still comes up, then remove the outside quotes: `ModuleNotFoundError: No module named 'tqdm.auto'`
+
+   If the error includes any messages that include bits unique to your filesystem, always remove those in the search query since other users will not have the same filesystem as yours. For example:
+
+   ```bash
+   python -c 'open("/tmp/wrong_path.txt", "r")'
+   Traceback (most recent call last):
+     File "<string>", line 1, in <module>
+   FileNotFoundError: [Errno 2] No such file or directory: '/tmp/wrong_path.txt'
+   ```
+   Here you'd search for just: `"FileNotFoundError: [Errno 2] No such file or directory"`
+
+   If the local information that you removed were inside the error message and you removed them you may need to remove double quotes since your query is no longer exact. So if the error message was something like:
+
+   ```bash
+      ValueError: '/tmp/wrong_path.txt' cannot be found
+   ```
+
+   then you'd search for `"ValueError" "cannot be found"`
+
+   As you search you will notice that when you don't use quotes often the search engines will return a variety of unrelated hits, which may or may not be what you want.
+
+   Experiment with different ways and find which approach gives the most satisfactory results.
+
+2. Keep the issue short, providing the information that you think will aid the developers to understand your situation. Put yourself in the shoes of the person who has never seen your code or knows anything about your custom setup. This mental exercise will help to develop an intuition to what/what not to share"
+
+3. If there is a software failure, always provide the full traceback, for example:
+
+   ```python
+   $ python -c 'import transformers'
+   Traceback (most recent call last):
+     File "<string>", line 1, in <module>
+     File "/transformers/src/transformers/__init__.py", line 34, in <module>
+       from . import dependency_versions_check
+     File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
+       from .file_utils import is_tokenizers_available
+     File "/transformers/src/transformers/file_utils.py", line 40, in <module>
+       from tqdm.auto import tqdm
+   ModuleNotFoundError: No module named 'tqdm.auto'
+   ```
+
+   As compared to providing just the last line of the error message, e.g.:
+   ```python
+   ModuleNotFoundError: No module named 'tqdm.auto'
+   ```
+   which is not sufficient.
+
+   If your application is running on more than one GPU (e.g. under `DistributedDataParallel`) and typically getting every log and traceback printed multiple times, please make sure that you paste only one copy of it. At times the traceback from parallel processes may get interleaved - so either disentangle these or change the loggers to log only for `local_rank==0` so that only one process logs things.
+
+4. When quoting a traceback, command line instructions and any type of code always enclose it in triple backticks inside the editor window, that is:
+
+   ````
+   ```
+   git clone https://github.com/huggingface/transformers
+   cd transformers
+   pip install .
+   ```
+   ````
+
+   If it's a command line with a long argument list, please consider breaking it down using backslashes and new lines. Here is an example of a good command line quote:
+
+   ```bash
+    cd examples/seq2seq
+    python -m torch.distributed.launch --nproc_per_node=2 ./finetune_trainer.py \
+    --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
+    --output_dir output_dir --overwrite_output_dir \
+    --do_train --n_train 500 --num_train_epochs 1 \
+    --per_device_train_batch_size 1  --freeze_embeds \
+    --src_lang en_XX --tgt_lang ro_RO --task translation \
+    --fp16 --sharded_ddp
+   ```
+
+   If you don't break it up, one has to scroll horizontally which often makes it quite difficult to quickly see what's happening.
+
+   The backslashes allow us to copy the command directly into the console to run it, without needing to edit it.
+
+5. Include only the important information that you think will help the developer to quickly identify the problem.
+
+   For example applications often create huge amounts of logs. Ask yourself whether providing all or parts of the log is useful.
+
+   Pasting a 100-1000 lines of log into the issue is an immediate turn off, since it will take a lot of time to figure out where the pertinent parts of the log are.
+
+   Attaching a full log can be helpful if it's done as an attachment, if it's enclosed in the following html code in the comment editor window:
+
+   ```
+   <details>
+   <summary>Full log</summary>
+   <pre>
+
+   many
+   lines
+   go
+   here
+
+   </pre>
+   </details>
+   ```
+
+   which would result in the following entry, which can be opened if desired, but otherwise takes little space.
+
+   <details>
+   <summary>Full log</summary>
+   <pre>
+   many
+   lines
+   go
+   here
+   </pre>
+   </details>
+
+    You could also provide a link to a pastebin service, but this is less beneficial since those links tend to expire quickly and future readers of your issue might not be able to access that log file anymore and may lack some context.
+
+6. If this is an issue in your code, do try to reduce that code to a minimal example that still demonstrates the problem. Please ask at the forums if you have a hard time figuring how to do that. Please realize that we don't have the luxury of having time to try and understand all of your custom code.
+
+   If you really tried to make a short reproducible code but couldn't figure it out, it might be that having a traceback will give the developer enough information to know what's going on. But if it is not enough and we can't reproduce the problem, we can't really solve it.
+
+   Do not dispair if you can't figure it out from the begining, just share what you can and perhaps someone else will be able to help you at the forums.
+
+7. If you forked off some of this project's code or example applications, please, do not ask us to go into your code repository and figure out what you may have done. The code is already very complex and unless there is an easy way to do a diff and it's a small diff, it won't be possible to find someone with time on their hands to make a lengthy investigation. Albeit, you might find someone at the forums who will be generous to do this for you.
+
+8. Before reporting an issue, first, always try to update your environment to the latest official version of this library. We have no resources to go and debug older revisions, which could easily have bugs that have been fixed in the latest released version.
+
+   We understand that this is not always possible, especially when APIs change, in which case file an issue against the highest library version your environment can support.
+
+   Of course, if you upgrade the library, always retest that the problem is still there.
+
+9. Please do not ask us to reproduce an issue with your custom data, since we don't have it. So, either you should use some existing dataset supported by HF datasets or you need to supply a code that generates a small sample on the fly, or some another quick and simple way to get it.
+
+   Please do not send us any non-public domain data that may require a license or a permission to be used.
+
+10. Do not tag multiple developers on the issue unless you know this is expected, either because you asked them and they gave you an explicit permission to tag them or the issue template instructs you to do so.
+
+   The "who to tag for what domain" part of the issue template is there to help users direct their questions to the right developers who are designated maintainers of project's specific domains. They can then decide at their own discretion to tag other developers if they feel it'd help move the issue forward.
+
+   We currently don't have a triage service and we trust your capacity to identify the right domain and thus the persons to tag in your issue. If you are not sure, please use the forums to ask for guidance.
+
+   When in doubt, err on the side of not tagging a given person. If you tag multiple people out of context or permission don't be surprised if you get no response at all. Please remember that every time you tag someone, they get a notification and you're taking their time without their permission. Please be sensitive to that.
+
+   If you got helped by one of the developers in the past please don't tag them in future issues, unless they are listed in the issue template for the domain you are asking about or that developer gave you an explicit permission to tag them in future issues.
+
+   If you see a certain developer doing multiple and/or recent commits into a specific area of the project that you feel is relevant to your issue, it is not a good reason to tag them. Various developers may be fixing things that prevent them from moving forward, but often their work is focused on a totally different domain. And while they may or may not know how to help you with the problem at hand, it would benefit the whole community much more if they focus on the domain of their unique expertise.
+
+11. Use the Edit button. Take your time, and re-read and improve the wording and formatting to make your posts and comments as easy to understand as possible.
+
+    Avoid posting multiple comments in a row, as each comment generates a notification for the developers tagged in that issue. If you happened to post multiple comments in a row, and nobody followed up yet - consider merging those into one or a few comments while editing the combined content to be coherent.
+
+    If you choose to edit your older comments after others posted follow up comments you need to be aware that your modifications might not be noticed, so if it's not a typo fixing, try to write a new comment flagging that something has been changed in the previous comments.
+
+    For example, the very first comment is the most important one. If while the thread unfolds you realize that things aren't as they seemed to you originally you may want to edit the first post to reflect the up-to-date understanding of the issue at hand so that it helps those who read your issue in the future quickly understand what's going on and not need to sift through dozens of comments. It also helps to indicate that the post was edited. So, those reading the thread later can understand why there might be certain discontinuity in the information flow.
+
+    Use bullets and items if you have lists of items and the outcome improves overall readability.
+
+    Use backticks to refer to class and function names, e.g. `BartModel` and `generate` as these stand out and improve the speed of a reader's comprehension.
+
+    Try not use italics and bold text too much as these often make the text more difficult to read.
+
+
+12. If you are cross-referencing a specific comment in a given thread or another issue, always link to that specific comment, rather than using the issue link. If you do the latter it could be quite impossible to find which specific comment you're referring to.
+
+    To get the link to the specific comment do not copy the url from the location bar of your browser, but instead, click the `...` icon in the upper right corner of the comment and then select "Copy Link".
+
+    For example the first link is a link to an issue, and the second to a specific comment in the same issue:
+
+    1. https://github.com/huggingface/transformers/issues/9257
+    2. https://github.com/huggingface/transformers/issues/9257#issuecomment-749945162
+
+
+13. If you are replying to a last comment, it's totally fine to make your reply with just your comment in it. The readers can follow the information flow here.
+
+    But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying it. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like:
+
+    ```
+    > How big is your gpu cluster?
+
+    Our cluster is made of 256 gpus.
+    ```
+
+    If you are addressing multiple comments, quote the relevant parts of each before your answer. Some people use the same comment to do multiple replies, others separate them into separate comments. Either way works. The latter approach helps for linking to a specific comment.
+
+In general the best way to figure out what works the best is learn from issues posted by other people - see which issues get great responses and which get little to no response - observe what the posters who received great responses did differently from those who did not.
+
+Thank you for reading this somewhat lengthy document. We would like to conclude that these are not absolute rules, but a friendly advice that will help maximize the chances for us to understand what you are trying to communicate, reproduce the problem then resolve it to your satisfaction and the benefit of the whole community.
+
+If after reading this document there are remaining questions on how and why or there is a need for further elucidation, please, don't hesitate to ask your question in [this thread](https://discuss.huggingface.co/t/how-to-request-support/3128).
--- a/1
+++ b/1
@ -1,3 +1,4 @@
+Copyright 2018- The Hugging Face team. All rights reserved.

                                 Apache License
                           Version 2.0, January 2004
--- a/15
+++ b/15
@ -1,4 +1,4 @@
-.PHONY: modified_only_fixup extra_quality_checks quality style fixup fix-copies test test-examples docs
+.PHONY: deps_table_update modified_only_fixup extra_quality_checks quality style fixup fix-copies test test-examples docs


 check_dirs := examples tests src utils
@ -14,10 +14,16 @@ modified_only_fixup:
 		echo "No library .py files were modified"; \
 	fi

+# Update src/transformers/dependency_versions_table.py
+
+deps_table_update:
+	@python setup.py deps_table_update
+
 # Check that source code meets quality standards

-extra_quality_checks:
+extra_quality_checks: deps_table_update
 	python utils/check_copies.py
+	python utils/check_table.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
 	python utils/style_doc.py src/transformers docs/source --max_len 119
@ -32,7 +38,7 @@ quality:

 # Format source code automatically and check is there are any problems left that need manual fixing

-style:
+style: deps_table_update
 	black $(check_dirs)
 	isort $(check_dirs)
 	python utils/style_doc.py src/transformers docs/source --max_len 119
@ -45,6 +51,7 @@ fixup: modified_only_fixup extra_quality_checks

 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
+	python utils/check_table.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite

 # Run tests for the library
@ -60,4 +67,4 @@ test-examples:
 # Check that docs can build

 docs:
-	cd docs && make html SPHINXOPTS="-W"
+	cd docs && make html SPHINXOPTS="-W -j 4"
--- a/README.md
+++ b/README.md
@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 <p align="center">
    <br>
    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
@ -31,12 +47,9 @@

 🤗 Transformers is backed by the two most popular deep learning libraries, [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), with a seamless integration between them, allowing you to train your models with one then load it for inference with the other.

-### Recent contributors
-[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/0)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/0)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/1)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/1)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/2)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/2)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/3)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/3)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/4)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/4)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/5)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/5)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/6)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/6)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/7)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/7)
-
 ## Online demos

-You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer an [inference API](https://huggingface.co/pricing) to use those models.
+You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) to use those models.

 Here are a few examples:
 - [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
@ -137,14 +150,16 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta

 ## Installation

+### With pip
+
 This repository is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for [examples](https://github.com/huggingface/transformers/tree/master/examples)) and TensorFlow 2.0.

 You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

 First, create a virtual environment with the version of Python you're going to use and activate it.

-Then, you will need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
+Then, you will need to install at least one of TensorFlow 2.0, PyTorch or Flax.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform and/or [Flax installation page](https://github.com/google/flax#quick-install).

 When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:

@ -152,18 +167,38 @@ When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be
 pip install transformers
 ```

-If you'd like to play with the examples, you must [install the library from source](https://huggingface.co/transformers/installation.html#installing-from-source).
+If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/transformers/installation.html#installing-from-source).
+
+### With conda
+
+Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
+
+🤗 Transformers can be installed using conda as follows:
+
+```shell script
+conda install -c huggingface transformers
+```
+
+Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.

 ## Models architectures

+**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co) where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
+
+Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
 🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/transformers/model_summary.html) for a high-level summary of each them):

 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
 1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft Research) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
@ -177,26 +212,30 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-ultilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
 1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
 1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
 1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.

+To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable)
+
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations. You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).


@ -214,13 +253,17 @@ These implementations have been tested on several datasets (see the example scri

 ## Citation

-We now have a [paper](https://arxiv.org/abs/1910.03771) you can cite for the 🤗 Transformers library:
+We now have a [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) you can cite for the 🤗 Transformers library:
 ```bibtex
-@article{Wolf2019HuggingFacesTS,
-  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
-  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush},
-  journal={ArXiv},
-  year={2019},
-  volume={abs/1910.03771}
+@inproceedings{wolf-etal-2020-transformers,
+    title = "Transformers: State-of-the-Art Natural Language Processing",
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = oct,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+    pages = "38--45"
 }
 ```
--- a/docs/README.md
+++ b/docs/README.md
@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # Generating the documentation

 To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@ -2,6 +2,15 @@

 /* Colab dropdown */

+table.center-aligned-table td {
+    text-align: center;
+}
+
+table.center-aligned-table th {
+    text-align: center;
+    vertical-align: middle;
+}
+
 .colab-dropdown {
    position: relative;
    display: inline-block;
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@ -1,14 +1,17 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v3.5.0"
-// Dictionary doc folder to label
+const stableVersion = "v4.2.0"
+// Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v3.5.0/v3.5.1",
+    "": "v4.2.0/v4.2.1 (stable)",
+    "v4.1.1": "v4.1.0/v4.1.1",
+    "v4.0.1": "v4.0.0/v4.0.1",
+    "v3.5.1": "v3.5.0/v3.5.1",
    "v3.4.0": "v3.4.0",
    "v3.3.1": "v3.3.0/v3.3.1",
    "v3.2.0": "v3.2.0",
-    "v3.1.0": "v3.1.0 (stable)",
+    "v3.1.0": "v3.1.0",
    "v3.0.2": "v3.0.0/v3.0.1/v3.0.2",
    "v2.11.0": "v2.11.0",
    "v2.10.0": "v2.10.0",
--- a/docs/source/add_new_model.rst
+++ b/docs/source/add_new_model.rst
@ -0,0 +1,844 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+
+How to add a model to 🤗 Transformers?
+=======================================================================================================================
+
+Adding a new model is often difficult and requires an in-depth knowledge of the 🤗 Transformers library and ideally also
+of the model's original repository. At Hugging Face, we are trying to empower the community more and more to add models
+independently. Thus, for some new models that the community wants to be added to 🤗 Transformers, we create a customized
+*call-for-model-addition* that explains step-by-step how to add the requested model. With this
+*call-for-model-addition*, we want to teach a motivated and experienced contributor of the community how to port a
+model to 🤗 Transformers.
+
+If this sounds like something you would be interested in, feel free to check out the currently open
+“calls-for-model-addition” `here
+<https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model/open_model_proposals/README.md>`__
+and to contact us.
+
+If selected, you will then work closely with one member of the Hugging Face team to integrate the model into 🤗
+Transformers. By doing so, you will both gain a theoretical and deep practical understanding of the proposed model. But
+more importantly, you will have made a major open-source contribution to 🤗 Transformers. Along the way, you will:
+
+-  get insights into open-source best practices
+-  understand the design principles of one of the most popular NLP libraries
+-  learn how to do efficiently test large NLP models
+-  learn how to integrate Python utilities like ``black``, ``isort``, ``make fix-copies`` into a library to always
+   ensure clean and readable code
+
+We are also more than happy if you want to add a model that cannot be found in the “calls-for-model-addition” folder.
+The following sections explain in detail how to add a new model. It might also be very helpful to check out already
+added models to see if those resemble the model you would like to add `here
+<https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed>`__.
+
+To start, let's try to get a general overview of the Transformers library.
+
+General overview of 🤗 Transformers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+First, you should get a general overview of 🤗 Transformers. 🤗 Transformers is a very opinionated library, so there is a
+chance that you don't agree with some of the library's philosophies or design choices. From our experience, however, we
+found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗
+Transformers while keeping maintenance costs at a reasonable level.
+
+A good first starting point to better understand the library is to read the :doc:`documentation of our philosophy
+<philosophy>`. As a result of our way of working, there are some choices that we try to apply to all models:
+
+-  Composition is generally favored over-abstraction
+-  Duplicating code is not always bad if it strongly improves the readability or accessibility of a model
+-  Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only
+   have to look into the respective ``modeling_....py`` file.
+
+In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for
+inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the
+person that will use your model, but also everybody that will read, try to understand, and possibly tweak your code.
+
+With this in mind, let's go a bit deeper into the general library design.
+
+Overview of models
+-----------------------------------------------------------------------------------------------------------------------
+
+To successfully add a model, it is important to understand the interaction between your model and its config,
+:class:`~transformers.PreTrainedModel`, and :class:`~transformers.PretrainedConfig`. For exemplary purposes, we will
+call the model to be added to 🤗 Transformers ``BrandNewBert``.
+
+Let's take a look:
+
+.. image:: ./imgs/transformers_overview.png
+
+As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute
+minimum. There are never more than two levels of abstraction for any model in the library. :obj:`BrandNewBertModel`
+inherits from :obj:`BrandNewBertPreTrainedModel` which in turn inherits from :class:`~transformres.PreTrainedModel` and
+that's it. As a general rule, we want to make sure that a new model only depends on
+:class:`~transformers.PreTrainedModel`. The important functionalities that are automatically provided to every new
+model are :meth:`~transformers.PreTrainedModel.from_pretrained` and
+:meth:`~transformers.PreTrainedModel.save_pretrained`, which are used for serialization and deserialization. All of the
+other important functionalities, such as :meth:`BrandNewBertModel.forward` should be completely defined in the new
+``modeling_brand_new_bert.py`` script. Next, we want to make sure that a model with a specific head layer, such as
+:obj:`BrandNewBertForMaskedLM` does not inherit from :obj:`BrandNewBertModel`, but rather uses :obj:`BrandNewBertModel`
+as a component that can be called in its forward pass to keep the level of abstraction low. Every new model requires a
+configuration class, called :obj:`BrandNewBertConfig`. This configuration is always stored as an attribute in
+:class:`~transformers.PreTrainedModel`, and thus can be accessed via the ``config`` attribute for all classes
+inheriting from :obj:`BrandNewBertPreTrainedModel`:
+
+   .. code:: python
+
+      model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
+      model.config  # model has access to its config
+
+Similar to the model, the configuration inherits basic serialization and deserialization functionalities from
+:class:`~transformers.PretrainedConfig`. Note that the configuration and the model are always serialized into two
+different formats - the model to a `pytorch_model.bin` file and the configuration to a `config.json` file. Calling
+:meth:`~transformers.PreTrainedModel.save_pretrained` will automatically call
+:meth:`~transformers.PretrainedConfig.save_pretrained`, so that both model and configuration are saved.
+
+
+Overview of tokenizers
+-----------------------------------------------------------------------------------------------------------------------
+
+Not quite ready yet :-( This section will be added soon!
+
+Step-by-step recipe to add a model to 🤗 Transformers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries
+of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model:
+
+1. `Porting GPT2 Model <https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28>`__ by `Thomas
+   <https://huggingface.co/thomwolf>`__
+2. `Porting WMT19 MT Model <https://huggingface.co/blog/porting-fsmt>`__ by `Stas <https://huggingface.co/stas>`__
+
+From experience, we can tell you that the most important things to keep in mind when adding a model are:
+
+-  Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist
+   somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy
+   from. `grep <https://www.gnu.org/software/grep/>`__ and `rg <https://github.com/BurntSushi/ripgrep>`__ are your
+   friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and
+   your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code
+   is based on XLM.
+-  It's more of an engineering challenge than a scientific challenge. You should spend more time on creating an
+   efficient debugging environment than trying to understand all theoretical aspects of the model in the paper.
+-  Ask for help, when you're stuck! Models are the core component of 🤗 Transformers so that we at Hugging Face are more
+   than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making
+   progress.
+
+In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers.
+
+The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do
+List:
+
+-  1. ☐ (Optional) Understood theoretical aspects
+-  2. ☐ Prepared transformers dev environment
+-  3. ☐ Set up debugging environment of the original repository
+-  4. ☐ Created script that successfully runs forward pass using original repository and checkpoint
+-  5. ☐ Successfully added the model skeleton to Transformers
+-  6. ☐ Successfully converted original checkpoint to Transformers checkpoint
+-  7. ☐ Successfully ran forward pass in Transformers that gives identical output to original checkpoint
+-  8. ☐ Finished model tests in Transformers
+-  9. ☐ Successfully added Tokenizer in Transformers
+-  10. ☐ Run end-to-end integration tests
+-  11. ☐ Finished docs
+-  12. ☐ Uploaded model weights to the hub
+-  13. ☐ Submitted the pull request
+-  14. ☐ (Optional) Added a demo notebook
+
+To begin with, we usually recommend to start by getting a good theoretical understanding of ``BrandNewBert``. However,
+if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive
+into the ``BrandNewBert``'s code-base. This option might suit you better, if your engineering skills are better than
+your theoretical skill, if you have trouble understanding ``BrandNewBert``'s paper, or if you just enjoy programming
+much more than reading scientific papers.
+
+1. (Optional) Theoretical aspects of BrandNewBert
+-----------------------------------------------------------------------------------------------------------------------
+
+You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large
+sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is
+not to get a deep theoretical understanding of the paper, but to extract the necessary information required to
+effectively re-implement the model in 🤗 Transformers. That being said, you don't have to spend too much time on the
+theoretical aspects, but rather focus on the practical ones, namely:
+
+-  What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like
+   encoder-decoder model? Look at the :doc:`model_summary` if you're not familiar with the differences between those.
+-  What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,*
+   summarization?
+-  What is the novel feature of the model making it different from BERT/GPT-2/BART?
+-  Which of the already existing `🤗 Transformers models <https://huggingface.co/transformers/#contents>`__ is most
+   similar to *brand_new_bert*?
+-  What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used
+   for BERT or BART?
+
+After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the
+Hugging Face team with any questions you might have. This might include questions regarding the model's architecture,
+its attention layer, etc. We will be more than happy to help you.
+
+2. Next prepare your environment
+-----------------------------------------------------------------------------------------------------------------------
+
+1. Fork the `repository <https://github.com/huggingface/transformers>`__ by clicking on the ‘Fork' button on the
+   repository's page. This creates a copy of the code under your GitHub user account.
+
+2. Clone your ``transformers`` fork to your local disk, and add the base repository as a remote:
+
+   .. code:: bash
+
+      git clone https://github.com/[your Github handle]/transformers.git
+      cd transformers
+      git remote add upstream https://github.com/huggingface/transformers.git
+
+3. Set up a development environment, for instance by running the following command:
+
+   .. code:: bash
+
+      python -m venv .env
+      source .env/bin/activate
+      pip install -e ".[dev]"
+
+and return to the parent directory
+
+.. code:: bash
+
+   cd ..
+
+4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the
+   instructions on https://pytorch.org/get-started/locally/.
+
+**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient.
+
+5. To port *brand_new_bert*, you will also need access to its original repository:
+
+.. code:: bash
+
+   git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git 
+   cd brand_new_bert
+   pip install -e .
+
+Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers.
+
+3.-4. Run a pretrained checkpoint using the original repository
+-----------------------------------------------------------------------------------------------------------------------
+
+At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very
+“researchy”. Meaning that documentation might be lacking and the code can be difficult to understand. But this should
+be exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people
+stand on the shoulders of giants* which translates here very well into taking a working model and rewriting it to make
+it as **accessible, user-friendly, and beautiful** as possible. This is the number-one motivation to re-implement
+models into 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**.
+
+You should start thereby by diving into the original repository.
+
+Successfully running the official pretrained model in the original repository is often **the most difficult** step.
+From our experience, it is very important to spend some time getting familiar with the original code-base. You need to
+figure out the following:
+
+-  Where to find the pretrained weights?
+-  How to load the pretrained weights into the corresponding model?
+-  How to run the tokenizer independently from the model?
+-  Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually,
+   you only have to reimplement those functions.
+-  Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes,
+   *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers,
+   *e.g.* *self-attention*, *cross-attention*...?
+-  How can you debug the model in the original environment of the repo? Do you have to add `print` statements, can you
+   work with an interactive debugger like `ipdb`, or should you use an efficient IDE to debug the model, like PyCharm?
+
+It is very important that before you start the porting process, that you can **efficiently** debug code in the original
+repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or
+even a pull request in the original repository. The maintainers of this repository are most likely very happy about
+someone looking into their code!
+
+At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original
+model. We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to
+dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. Only
+at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the
+model also works as expected on GPU.
+
+In general, there are two possible debugging environments for running the original model
+
+-  `Jupyter notebooks <https://jupyter.org/>`__ / `google colab
+   <https://colab.research.google.com/notebooks/intro.ipynb>`__
+-  Local python scripts.
+
+Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split
+logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also,
+notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging
+Face team for help. If you are familiar with Jupiter notebooks, we strongly recommend you to work with them.
+
+The obvious disadvantage of Jupyther notebooks is that if you are not used to working with them you will have to spend
+some time adjusting to the new programming environment and that you might not be able to use your known debugging tools
+anymore, like ``ipdb``.
+
+For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a
+single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in
+pseudocode):
+
+.. code:: bash
+
+   model = BrandNewBertModel.load_pretrained_checkpoint(/path/to/checkpoint/)
+   input_ids = [0, 4, 5, 2, 3, 7, 9]  # vector of input ids
+   original_output = model.predict(input_ids)
+
+Next, regarding the debugging strategy, there are generally a few from which to choose from:
+
+-  Decompose the original model into many small testable components and run a forward pass on each of those for
+   verification
+-  Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on
+   those, and use intermediate print statements or breakpoints for verification
+
+Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code
+base.
+
+If the original code-base allows you to decompose the model into smaller sub-components, *e.g.* if the original
+code-base can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages
+to taking the more difficult road in the beginning:
+
+- at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically
+  for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead
+  of relying on visual comparison via print statements
+- it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting
+  individual components and thus structure your work better
+- separating the model into logical meaningful components will help you to get a better overview of the model's design
+  and thus to better understand the model
+- at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue
+  changing your code
+
+`Lysandre's <https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed>`__ integration checks for ELECTRA
+gives a nice example of how this can be done.
+
+However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode,
+it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good
+example is `T5's MeshTensorFlow <https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow>`__ library which is
+very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one
+often relies on verifying print statements.
+
+No matter which strategy you choose, the recommended procedure is often the same in that you should start to debug the
+starting layers first and the ending layers last.
+
+It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following
+layers in the following order:
+
+1.  Retrieve the input IDs passed to the model
+2.  Retrieve the word embeddings
+3.  Retrieve the input of the first Transformer layer
+4.  Retrieve the output of the first Transformer layer
+5.  Retrieve the output of the following n - 1 Transformer layers
+6.  Retrieve the output of the whole BrandNewBert Model
+
+Input IDs should thereby consists of an array of integers, *e.g.* ``input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]``
+
+The outputs of the following layers often consist of multi-dimensional float arrays and can look like this:
+
+.. code:: bash
+
+   [[
+    [-0.1465, -0.6501,  0.1993,  ...,  0.1451,  0.3430,  0.6024],
+    [-0.4417, -0.5920,  0.3450,  ..., -0.3062,  0.6182,  0.7132],
+    [-0.5009, -0.7122,  0.4548,  ..., -0.3662,  0.6091,  0.7648],
+    ...,
+    [-0.5613, -0.6332,  0.4324,  ..., -0.3792,  0.7372,  0.9288],
+    [-0.5416, -0.6345,  0.4180,  ..., -0.3564,  0.6992,  0.9191],
+    [-0.5334, -0.6403,  0.4271,  ..., -0.3339,  0.6533,  0.8694]]],
+
+We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original
+model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001!
+Since it is normal that the exact same model written in different libraries can give a slightly different output
+depending on the library framework, we accept an error tolerance of 1e-3 (0.001). It is not enough if the model gives
+nearly the same output, they have to be the almost identical. Therefore, you will certainly compare the intermediate
+outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of
+*brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely
+important. Here is some advice is to make your debugging environment as efficient as possible.
+
+-  Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should
+   probably take the time to write a longer script that decomposes the original model into smaller sub-components to
+   retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on
+   TensorFlow print operations like `tf.print <https://www.tensorflow.org/api_docs/python/tf/print>`__ to output
+   intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when
+   running the forward pass, *e.g.* check-out `this link <https://github.com/google/jax/issues/196>`__.
+-  Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle
+   becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds.
+   In case only very large checkpoints are available, it might make more sense to create a dummy model in the new
+   environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version
+   of your model
+-  Make sure you are using the easiest way of calling a forward pass in the original repository. Ideally, you want to
+   find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called
+   ``predict``, ``evaluate``, ``forward`` or ``__call__``. You don't want to debug a function that calls ``forward``
+   multiple times, *e.g.* to generate text, like ``autoregressive_sample``, ``generate``.
+-  Try to separate the tokenization from the model's `forward` pass. If the original repository shows examples where
+   you have to input a string, then try to find out where in the forward call the string input is changed to input ids
+   and start from this point. This might mean that you have to possibly write a small script yourself or change the
+   original code so that you can directly input the ids instead of an input string.
+-  Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield
+   random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging
+   environment is **deterministic** so that the dropout layers are not used. Or use `transformers.file_utils.set_seed`
+   if the old and new implementations are in the same framework.
+
+The following section gives you more specific details/tips on how you can do this for *brand_new_bert*.
+
+5.-14. Port BrandNewBert to 🤗 Transformers
+-----------------------------------------------------------------------------------------------------------------------
+
+Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers' fork:
+
+::
+
+   cd transformers
+
+In the special case that you are adding a model whose architecture exactly matches the model architecture of an
+existing model you only have to add a conversion script as described in `this section <#write-a-conversion-script>`__.
+In this case, you can just re-use the whole model architecture of the already existing model.
+
+Otherwise, let's start generating a new model with the amazing Cookiecutter!
+
+**Use the Cookiecutter to automatically generate the model's code**
+
+To begin with head over to the `🤗 Transformers templates
+<https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model>`__ to make use of our
+``cookiecutter`` implementation to automatically generate all the relevant files for your model. Again, we recommend
+only adding the PyTorch version of the model at first. Make sure you follow the instructions of the ``README.md`` on
+the `🤗 Transformers templates <https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model>`__
+carefully.
+
+**Open a Pull Request on the main huggingface/transformers repo**
+
+Before starting to adapt the automatically generated code, now is the time to open a “Work in progress (WIP)” pull
+request, *e.g.* “[WIP] Add *brand_new_bert*”, in 🤗 Transformers so that you and the Hugging Face team can work
+side-by-side on integrating the model into 🤗 Transformers.
+
+You should do the following:
+
+1. Create a branch with a descriptive name from your master branch
+
+::
+
+   git checkout -b add_brand_new_bert
+
+2. Commit the automatically generated code:
+
+::
+
+   git add .
+   git commit
+
+3. Fetch and rebase to current master
+
+::
+
+   git fetch upstream
+   git rebase upstream/master
+
+4. Push the changes to your account using:
+
+::
+
+   git push -u origin a-descriptive-name-for-my-changes
+
+5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the
+   GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for
+   future changes.
+
+6. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page.
+
+In the following, whenever you have done some progress, don't forget to commit your work and push it to your account so
+that it shows in the pull request. Additionally, you should make sure to update your work with the current master from
+time to time by doing:
+
+::
+
+   git fetch upstream
+   git merge upstream/master
+
+In general, all questions you might have regarding the model or your implementation should be asked in your PR and
+discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
+if you have a question. It is often very helpful to point the Hugging Face team to your added code so that the Hugging
+Face team can efficiently understand your problem or question.
+
+To do so, you can go to the “Files changed” tab where you see all of your changes, go to a line regarding which you
+want to ask a question, and click on the “+” symbol to add a comment. Whenever a question or problem has been solved,
+you can click on the “Resolve” button of the created comment.
+
+In the same way, the Hugging Face team will open comments when reviewing your code. We recommend asking most questions
+on GitHub on your PR. For some very general questions that are not very useful for the public, feel free to ping the
+Hugging Face team by Slack or email.
+
+**5. Adapt the generated models code for brand_new_bert**
+
+At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be
+found in the generated files ``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` and
+``src/transformers/models/brand_new_bert/configuration_brand_new_bert.py``.
+
+Now you can finally start coding :). The generated code in
+``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` will either have the same architecture as BERT if
+it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what
+you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or
+BART?*". Implement those changes which often means to change the *self-attention* layer, the order of the normalization
+layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to
+get a better feeling of how your model should be implemented.
+
+**Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is
+advised to add a first *unclean*, copy-pasted version of the original code to
+``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` until you feel like all the necessary code is
+added. From our experience, it is much more efficient to quickly add a first version of the required code and
+improve/correct the code iteratively with the conversion script as described in the next section. The only thing that
+has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the
+following command should work:
+
+.. code:: python
+
+   from transformers import BrandNewBertModel, BrandNewBertConfig
+   model = BrandNewBertModel(BrandNewBertConfig())
+
+The above command will create a model according to the default parameters as defined in ``BrandNewBertConfig()`` with
+random weights, thus making sure that the ``init()`` methods of all components works.
+
+**6. Write a conversion script**
+
+Next, you should write a conversion script that lets you convert the checkpoint you used to debug *brand_new_bert* in
+the original repository to a checkpoint compatible with your just created 🤗 Transformers implementation of
+*brand_new_bert*. It is not advised to write the conversion script from scratch, but rather to look through already
+existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in
+the same framework as *brand_new_bert*. Usually, it is enough to copy an already existing conversion script and
+slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already
+existing conversion script for your model.
+
+-  If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script `here
+   <https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91>`__
+-  If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script `here
+   <https://github.com/huggingface/transformers/blob/master/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py>`__
+
+In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the
+name of a layer is defined by the name of the class attribute you give the layer. Let's define a dummy model in
+PyTorch, called ``SimpleModel`` as follows:
+
+.. code:: python
+
+   import torch.nn as nn
+
+   class SimpleModel(nn.Module):
+       def __init__(self):
+               super().__init__()
+               self.dense = nn.Linear(10, 10)
+               self.intermediate = nn.Linear(10, 10)
+               self.layer_norm = nn.LayerNorm(10)
+
+Now we can create an instance of this model definition which will fill all weights: ``dense``, ``intermediate``,
+``layer_norm`` with random weights. We can print the model to see its architecture
+
+.. code:: python
+
+   model = SimpleModel()
+
+   print(model)
+
+This will print out the following:
+
+.. code:: bash
+
+   SimpleModel(
+     (dense): Linear(in_features=10, out_features=10, bias=True)
+     (intermediate): Linear(in_features=10, out_features=10, bias=True)
+     (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
+   )
+
+We can see that the layer names are defined by the name of the class attribute in PyTorch. You can print out the weight
+values of a specific layer:
+
+.. code:: python
+
+   print(model.dense.weight.data)
+
+to see that the weights were randomly initialized
+
+.. code:: bash
+
+   tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
+            -0.2077,  0.2157],
+           [ 0.1044,  0.0201,  0.0990,  0.2482,  0.3116,  0.2509,  0.2866, -0.2190,
+             0.2166, -0.0212],
+           [-0.2000,  0.1107, -0.1999, -0.3119,  0.1559,  0.0993,  0.1776, -0.1950,
+            -0.1023, -0.0447],
+           [-0.0888, -0.1092,  0.2281,  0.0336,  0.1817, -0.0115,  0.2096,  0.1415,
+            -0.1876, -0.2467],
+           [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465,
+             0.2577,  0.0402],
+           [ 0.1502,  0.2465,  0.2566,  0.0693,  0.2352, -0.0530,  0.1859, -0.0604,
+             0.2132,  0.1680],
+           [ 0.1733, -0.2407, -0.1721,  0.1484,  0.0358, -0.0633, -0.0721, -0.0090,
+             0.2707, -0.2509],
+           [-0.1173,  0.1561,  0.2945,  0.0595, -0.1996,  0.2988, -0.0802,  0.0407,
+             0.1829, -0.1568],
+           [-0.1164, -0.2228, -0.0403,  0.0428,  0.1339,  0.0047,  0.1967,  0.2923,
+             0.0333, -0.0536],
+           [-0.1492, -0.1616,  0.1057,  0.1950, -0.2807, -0.2710, -0.1586,  0.0739,
+             0.2220,  0.2358]]).
+
+In the conversion script, you should fill those randomly initialized weights with the exact weights of the
+corresponding layer in the checkpoint. *E.g.*
+
+.. code:: python
+
+   # retrieve matching layer weights, e.g. by 
+   # recursive algorithm
+   layer_name = "dense"
+   pretrained_weight = array_of_dense_layer
+
+   model_pointer = getattr(model, "dense")
+
+   model_pointer.weight.data = torch.from_numpy(pretrained_weight)
+
+While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding
+pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert
+statements for the shape and print out the names of the checkpoints weights. E.g. you should add statements like:
+
+.. code:: python
+
+   assert (
+        model_pointer.weight.shape == pretrained_weight.shape
+   ), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
+
+Besides, you should also print out the names of both weights to make sure they match, *e.g.*
+
+.. code:: python
+
+   logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
+
+If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly
+initialized layer of the 🤗 Transformers implementation.
+
+An incorrect shape is most likely due to an incorrect setting of the config parameters in ``BrandNewBertConfig()`` that
+do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that
+PyTorch's implementation of a layer requires the weight to be transposed beforehand.
+
+Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that
+were not used for initialization to make sure the model is correctly converted. It is completely normal, that the
+conversion trials fail with either a wrong shape statement or wrong name assignment. This is most likely because either
+you used incorrect parameters in ``BrandNewBertConfig()``, have a wrong architecture in the 🤗 Transformers
+implementation, you have a bug in the ``init()`` functions of one of the components of the 🤗 Transformers
+implementation or you need to transpose one of the checkpoint weights.
+
+This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the
+Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save
+the model under a folder of your choice ``/path/to/converted/checkpoint/folder`` that should then contain both a
+``pytorch_model.bin`` file and a ``config.json`` file:
+
+.. code:: python
+
+   model.save_pretrained("/path/to/converted/checkpoint/folder")
+
+**7. Implement the forward pass**
+
+Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make
+sure that the forward pass is correctly implemented. In `Get familiar with the original repository
+<#run-a-pretrained-checkpoint-using-the-original-repository>`__, you have already created a script that runs a forward
+pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers
+implementation instead of the original one. It should look as follows:
+
+.. code:: python
+
+   model = BrandNewBertModel.from_pretrained(/path/to/converted/checkpoint/folder)
+   input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
+   output = model(input_ids).last_hidden_states
+
+It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact
+same output the very first time or that the forward pass throws an error. Don't be disappointed - it's expected! First,
+you should make sure that the forward pass doesn't throw any errors. It often happens that the wrong dimensions are
+used leading to a `Dimensionality mismatch` error or that the wrong data type object is used, *e.g.* ``torch.long``
+instead of ``torch.float32``. Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve
+certain errors.
+
+The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are
+equivalent to a precision of ``1e-3``. First, you should ensure that the output shapes are identical, *i.e.*
+``outputs.shape`` should yield the same value for the script of the 🤗 Transformers implementation and the original
+implementation. Next, you should make sure that the output values are identical as well. This one of the most difficult
+parts of adding a new model. Common mistakes why the outputs are not identical are:
+
+-  Some layers were not added, *i.e.* an `activation` layer was not added, or the residual connection was forgotten
+-  The word embedding matrix was not tied
+-  The wrong positional embeddings are used because the original implementation uses on offset
+-  Dropout is applied during the forward pass. To fix this make sure `model.training is False` and that no dropout
+   layer is falsely activated during the forward pass, *i.e.* pass `self.training` to `PyTorch's functional dropout
+   <https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout>`_
+
+The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗
+Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out
+intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗
+Transformers implementation shows a different output than the original implementation. First, make sure that the
+hard-coded ``input_ids`` in both scripts are identical. Next, verify that the outputs of the first transformation of
+the ``input_ids`` (usually the word embeddings) are identical. And then work your way up to the very last layer of the
+network. At some point, you will notice a difference between the two implementations, which should point you to the bug
+in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements
+in both the original implementation and 🤗 Transformers implementation, at the same positions in the network
+respectively, and to successively remove print statements showing the same values for intermediate presentions.
+
+When you're confident that both implementations yield the same output, verifying the outputs with
+``torch.allclose(original_output, output, atol=1e-3)``, you're done with the most difficult part! Congratulations - the
+work left to be done should be a cakewalk 😊.
+
+**8. Adding all necessary model tests**
+
+At this point, you have successfully added a new model. However, it is very much possible that the model does not yet
+fully comply with the required design. To make sure, the implementation is fully compatible with 🤗 Transformers, all
+common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under
+the same ``tests/test_modeling_brand_new_bert.py``. Run this test file to verify that all common tests pass:
+
+.. code:: python
+
+   pytest tests/test_modeling_brand_new_bert.py
+
+Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that
+
+-  
+
+   a) The community can easily understand your work by looking at specific tests of *brand_new_bert*
+
+-  
+
+   b) Future changes to your model will not break any important feature of the model.
+
+At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts
+you used earlier to implement the model to 🤗 Transformers. A template of those model tests is already added by the
+Cookiecutter, called ``BrandNewBertModelIntegrationTests`` and only has to be filled out by you. To ensure that those
+tests are passing, run
+
+.. code:: python
+
+   RUN_SLOW=1 pytest -sv tests/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests
+
+.. note::
+
+  In case you are using Windows, you should replace ``RUN_SLOW=1`` with ``SET RUN_SLOW=1``
+
+Second, all features that are special to *brand_new_bert* should be tested additionally in a separate test under
+``BrandNewBertModelTester``/``BrandNewBertModelTest``. This part is often forgotten but is extremely useful in two
+ways:
+
+-  It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the
+   special features of *brand_new_bert* should work.
+-  Future contributors can quickly test changes to the model by running those special tests.
+
+
+**9. Implement the tokenizer**
+
+Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent or very similar to an
+already existing tokenizer of 🤗 Transformers.
+
+It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗
+Transformers' implementation of the tokenizer.
+
+To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository
+that inputs a string and returns the ``input_ids``. It could look similar to this (in pseudo-code):
+
+.. code:: bash
+
+   input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+   model = BrandNewBertModel.load_pretrained_checkpoint(/path/to/checkpoint/)
+   input_ids = model.tokenize(input_str)
+
+You might have to take a deeper look again into the original repository to find the correct tokenizer function or you
+might even have to do changes to your clone of the original repository to only output the ``input_ids``. Having written
+a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be
+created. It should look similar to this:
+
+.. code:: python
+
+   from transformers import BrandNewBertTokenizer
+   input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+
+   tokenizer = BrandNewBertTokenizer.from_pretrained(/path/to/tokenizer/folder/)
+
+   input_ids = tokenizer(input_str).input_ids
+
+When both ``input_ids`` yield the same values, as a final step a tokenizer test file should also be added.
+
+Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should
+contain a couple of hard-coded integration tests.
+
+**10. Run End-to-end integration tests**
+
+Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the
+tokenizer to ``tests/test_modeling_brand_new_bert.py`` in 🤗 Transformers. Such a test should show on a meaningful
+text-to-text sample that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can
+include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none
+of the ported checkpoints has been fine-tuned on a downstream task it is enough to simply rely on the model tests. In a
+final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. It can
+happen that you forgot to add some ``.to(self.device)`` statements to internal tensors of the model, which in such a
+test would show in an error. In case you have no access to a GPU, the Hugging Face team can take care of running those
+tests for you.
+
+**11. Add Docstring**
+
+Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is
+a nice docstring and a doc page. The Cookiecutter should have added a template file called
+``docs/source/model_doc/brand_new_bert.rst`` that you should fill out. Users of your model will usually first look at
+this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for
+the community to add some *Tips* to show how the model should be used. Don't hesitate to ping the Hugging Face team
+regarding the docstrings.
+
+Next, make sure that the docstring added to ``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` is
+correct and included all necessary inputs and outputs. It is always to good to remind oneself that documentation should
+be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact
+point of the community with the model.
+
+**Code refactor**
+
+Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential
+incorrect code style by running:
+
+.. code:: bash
+
+   make style
+
+and verify that your coding style passes the quality check:
+
+.. code:: bash
+
+   make quality
+
+There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in
+the tests of your pull request. This is often because of some missing information in the docstring or some incorrect
+naming. The Hugging Face team will surely help you if you're stuck here.
+
+Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. With all
+tests passing, now it's a good time to go over the added code again and do some refactoring.
+
+You have now finished the coding part, congratulation! 🎉 You are Awesome! 😎
+
+**12. Upload the models to the model hub**
+
+In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each
+uploaded model checkpoint. You should work alongside the Hugging Face team here to decide on a fitting name for each
+checkpoint and to get the required access rights to be able to upload the model under the author's organization of
+*brand_new_bert*.
+
+It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the
+specific characteristics of this particular checkpoint, *e.g.* On which dataset was the checkpoint
+pretrained/fine-tuned on? On what down-stream task should the model be used? And also include some code on how to
+correctly use the model.
+
+**13. (Optional) Add notebook**
+
+It is very helpful to add a notebook that showcases in-detail how *brand_new_bert* can be used for inference and/or
+fine-tuned on a downstream task. This is not mandatory to merge your PR, but very useful for the community.
+
+**14. Submit your finished PR**
+
+You're done programming now and can move to the last step, which is getting your PR merged into master. Usually, the
+Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished
+PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your
+reviewer.
+
+Share your work!!
+-----------------------------------------------------------------------------------------------------------------------
+
+Now, it's time to get some credit from the community for your work! Having completed a model addition is a major
+contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be
+used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share
+your achievement with the community.
+
+**You have made another model that is super easy to access for everyone in the community! 🤯**
--- a/docs/source/benchmarks.rst
+++ b/docs/source/benchmarks.rst
@ -1,10 +1,22 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Benchmarks
 =======================================================================================================================

 Let's take a look at how 🤗 Transformer models can be benchmarked, best practices, and already available benchmarks.

-A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found `here
-<https://github.com/huggingface/transformers/blob/master/notebooks/05-benchmark.ipynb>`__.
+A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found :prefix_link:`here
+<notebooks/05-benchmark.ipynb>`.

 How to benchmark 🤗 Transformer models
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -87,6 +99,7 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
    --------------------------------------------------------------------------------

    ====================        ENVIRONMENT INFORMATION         ====================
+
    - transformers_version: 2.11.0
    - framework: PyTorch
    - use_torchscript: False
@ -133,6 +146,7 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
    --------------------------------------------------------------------------------

    ====================        ENVIRONMENT INFORMATION         ====================
+
    - transformers_version: 2.11.0
    - framework: Tensorflow
    - use_xla: False
@ -216,6 +230,7 @@ configurations must be inserted with the benchmark args as follows.
    --------------------------------------------------------------------------------

    ====================        ENVIRONMENT INFORMATION         ====================
+
    - transformers_version: 2.11.0
    - framework: PyTorch
    - use_torchscript: False
@ -285,6 +300,7 @@ configurations must be inserted with the benchmark args as follows.
    --------------------------------------------------------------------------------

    ====================        ENVIRONMENT INFORMATION         ====================
+
    - transformers_version: 2.11.0
    - framework: Tensorflow
    - use_xla: False
@ -341,5 +357,5 @@ The approach is detailed in the `following blogpost
 available `here
 <https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing>`__.

-With the new `benchmark` tools, it is easier than ever to share your benchmark results with the community `here
-<https://github.com/huggingface/transformers/blob/master/examples/benchmarking/README.md>`__.
+With the new `benchmark` tools, it is easier than ever to share your benchmark results with the community
+:prefix_link:`here <examples/benchmarking/README.md>`.
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BERTology
 -----------------------------------------------------------------------------------------------------------------------

@ -21,6 +33,6 @@ help people access the inner representations, mainly adapted from the great work
 * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained
  in https://arxiv.org/abs/1905.10650.

-To help you understand and use these features, we have added a specific example script: `bertology.py
-<https://github.com/huggingface/transformers/blob/master/examples/bertology/run_bertology.py>`_ while extract
-information and prune a model pre-trained on GLUE.
+To help you understand and use these features, we have added a specific example script: :prefix_link:`bertology.py
+<examples/research_projects/bertology/run_bertology.py>` while extract information and prune a model pre-trained on
+GLUE.
--- a/docs/source/community.md
+++ b/docs/source/community.md
@ -0,0 +1,49 @@
+# Community
+
+This page regroups resources around 🤗 Transformers developed by the community.
+
+## Community resources:
+
+| Resource     |      Description      |      Author      |
+|:----------|:-------------|------:|
+| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | A set of flashcards based on the [Transformers Docs Glossary](https://huggingface.co/transformers/master/glossary.html) that has been put into a form which can be easily learnt/revised using [Anki ](https://apps.ankiweb.net/) an open source, cross platform app specifically designed for long term knowledge retention. See this [Introductory video on how to use the flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) |
+
+## Community notebooks:
+
+| Notebook     |      Description      |      Author      |      |
+|:----------|:-------------|:-------------|------:|
+| [Train T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) |
+| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb)  | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) |
+| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb)  | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning |  [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) |
+| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb)  | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots |  [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) |
+| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb)  | How to train on sequences as long as 500,000 tokens with Reformer |  [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb)  |
+| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | How to fine-tune BART for summarization with fastai using blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) |
+| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb)  | How to generate tweets in the style of your favorite Twitter account by fine-tune a GPT-2 model |  [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) |
+| [A Step by Step Guide to Tracking Hugging Face Model Performance](https://colab.research.google.com/drive/1NEiqNPhiouu2pPwDAVeFoN4-vTYMz9F8)  | A quick tutorial for training NLP models with HuggingFace and & visualizing their performance with Weights & Biases |  [Jack Morris](https://github.com/jxmorris12) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1NEiqNPhiouu2pPwDAVeFoN4-vTYMz9F8) |
+| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb)  | How to build a "long" version of existing pretrained models |  [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) |
+| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | How to fine-tune longformer model for QA task | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) |
+| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | How to evaluate longformer on TriviaQA with `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) |
+| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb)  | How to fine-tune T5 for sentiment span extraction using a text-to-text format with PyTorch Lightning |  [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) |
+| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | How to fine-tune DistilBert for multiclass classification with PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)|
+|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|How to fine-tune BERT for multi-label classification using PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|
+|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|How to fine-tune T5 for summarization in PyTorch and track experiments with WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|
+|[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)|
+|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| How to train a Reformer model with bi-directional self-attention layers | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)|
+|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)|
+|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)|
+|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | How to fine-tune a non-English GPT-2 Model with Trainer class | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)|
+|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | How to fine-tune a DistilBERT Model for Multi Label Classification task | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)|
+|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)|
+|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune an Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
+|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
+|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | How to fine-tune DistilBERT for text classification in TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)|
+|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
+|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
+|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | How to fine-tune *TapasForQuestionAnswering* with a *tapas-base* checkpoint on the Sequential Question Answering (SQA) dataset | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)|
+|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | How to evaluate a fine-tuned *TapasForSequenceClassification* with a *tapas-base-finetuned-tabfact* checkpoint using a combination of the 🤗 datasets and 🤗 transformers libraries | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)|
+|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | How to fine-tune mBART using Seq2SeqTrainer for Hindi to English translation | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)|
+|[Fine-tune LayoutLM on FUNSD (a form understanding dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | How to fine-tune *LayoutLMForTokenClassification* on the FUNSD dataset for information extraction from scanned documents | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)|
+|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | How to fine-tune DistilGPT2 and generate text | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)|
+|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | How to fine-tune LED on pubmed for long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)|
+|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | How to effectively evaluate LED on long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)|
+|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | How to fine-tune *LayoutLMForSequenceClassification* on the RVL-CDIP dataset for scanned document classification | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)|
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -20,14 +20,17 @@ sys.path.insert(0, os.path.abspath('../../src'))
 # -- Project information -----------------------------------------------------

 project = u'transformers'
-copyright = u'2020, huggingface'
+copyright = u'2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0'
 author = u'huggingface'

 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'3.5.0'
-
+release = u'4.3.0'
+# Prefix link to point to master, comment this during version release and uncomment below line
+extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/master/%s', '')}
+# Prefix link to always point to corresponding version, uncomment this during version release
+# extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/v'+ release + '/%s', '')}

 # -- General configuration ---------------------------------------------------

@ -40,6 +43,7 @@ release = u'3.5.0'
 # ones.
 extensions = [
    'sphinx.ext.autodoc',
+    'sphinx.ext.extlinks',
    'sphinx.ext.coverage',
    'sphinx.ext.napoleon',
    'recommonmark',
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Converting Tensorflow Checkpoints
 =======================================================================================================================

@ -15,19 +27,14 @@ BERT

 You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google
 <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the
-`convert_bert_original_tf_checkpoint_to_pytorch.py
-<https://github.com/huggingface/transformers/blob/master/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>`_
-script.
+:prefix_link:`convert_bert_original_tf_checkpoint_to_pytorch.py
+<src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py>` script.

 This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated
 configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights
 from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that
-can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py
-<https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ ,
-`run_bert_classifier.py
-<https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and
-`run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\
-).
+can be imported using ``from_pretrained()`` (see example in :doc:`quicktour` , `run_glue.py
+<https://github.com/huggingface/transformers/blob/master/examples/text-classification/run_glue.py>`_\ ).

 You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
 checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\
@ -54,9 +61,8 @@ ALBERT
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
-`convert_albert_original_tf_checkpoint_to_pytorch.py
-<https://github.com/huggingface/transformers/blob/master/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>`_
-script.
+:prefix_link:`convert_albert_original_tf_checkpoint_to_pytorch.py
+<src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py>` script.

 The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``\ ) and the accompanying
 configuration file (\ ``albert_config.json``\ ), then creates and saves a PyTorch model. To run this conversion you
@ -158,3 +164,18 @@ Here is an example of the conversion process for a pre-trained XLM model:
     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
    [--config XML_CONFIG] \
    [--finetuning_task_name XML_FINETUNED_TASK]
+
+
+T5
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Here is an example of the conversion process for a pre-trained T5 model:
+
+.. code-block:: shell
+
+   export T5=/path/to/t5/uncased_L-12_H-768_A-12
+
+   transformers-cli convert --model_type t5 \
+     --tf_checkpoint $T5/t5_model.ckpt \
+     --config $T5/t5_config.json \
+     --pytorch_dump_output $T5/pytorch_model.bin
--- a/docs/source/custom_datasets.rst
+++ b/docs/source/custom_datasets.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Fine-tuning with custom datasets
 =======================================================================================================================

@ -63,7 +75,7 @@ read this in.
    test_texts, test_labels = read_imdb_split('aclImdb/test')

 We now have a train and test dataset, but let's also also create a validation set which we can use for for evaluation
-and tuning without training our test set results. Sklearn has a convenient utility for creating such splits:
+and tuning without tainting our test set results. Sklearn has a convenient utility for creating such splits:

 .. code-block:: python

@ -546,12 +558,15 @@ we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method
        end_positions = []
        for i in range(len(answers)):
            start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
-            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
-            # if None, the answer passage has been truncated
+            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
+
+            # if start position is None, the answer passage has been truncated
            if start_positions[-1] is None:
                start_positions[-1] = tokenizer.model_max_length
+
+            # if end position is None, the 'char_to_token' function points to the space before the correct token - > add + 1
            if end_positions[-1] is None:
-                end_positions[-1] = tokenizer.model_max_length
+                end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] + 1)
        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

    add_token_positions(train_encodings, train_answers)
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Glossary
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@ -12,11 +24,11 @@ General terms
 - MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
  by masking some tokens randomly, and has to predict the original text.
 - multimodal: a task that combines texts with another kind of inputs (for instance images).
- NLG: natural language generation, all tasks related to generating text ( for instance talk with transformers,
-  translation)
+- NLG: natural language generation, all tasks related to generating text (for instance talk with transformers,
+  translation).
 - NLP: natural language processing, a generic way to say "deal with texts".
 - NLU: natural language understanding, all tasks related to understanding what is in a text (for instance classifying
-  the whole text, individual words)
+  the whole text, individual words).
 - pretrained model: a model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods
  involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
  masking some words and trying to predict them (see MLM).
@ -214,7 +226,7 @@ Contrary to RNNs that have the position of each token embedded within them, tran
 each token. Therefore, the position IDs (``position_ids``) are used by the model to identify each token's position in
 the list of tokens.

-They are an optional parameter. If no ``position_ids`` is passed to the model, the IDs are automatically created as
+They are an optional parameter. If no ``position_ids`` are passed to the model, the IDs are automatically created as
 absolute positional embeddings.

 Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models use
--- a/docs/source/imgs/transformers_overview.png
+++ b/docs/source/imgs/transformers_overview.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -22,6 +22,18 @@ State-of-the-art NLP for everyone:
 - Hands-on practitioners
 - AI/ML/NLP teachers and educators

+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Lower compute costs, smaller carbon footprint:

 - Researchers can share trained models instead of always retraining
@ -35,6 +47,16 @@ Choose the right framework for every part of a model's lifetime:
 - Move a single model between TF2.0/PyTorch frameworks at will
 - Seamlessly pick the right framework for training, evaluation, production

+Experimental support for Flax with a few models right now, expected to grow in the coming months.
+
+`All the model checkpoints <https://huggingface.co/models>`__ are seamlessly integrated from the huggingface.co `model
+hub <https://huggingface.co>`__ where they are uploaded directly by `users <https://huggingface.co/users>`__ and
+`organizations <https://huggingface.co/organizations>`__.
+
+Current number of checkpoints: |checkpoints|
+
+.. |checkpoints| image:: https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen
+
 Contents
 -----------------------------------------------------------------------------------------------------------------------

@ -44,7 +66,7 @@ The documentation is organized in five parts:
  and a glossary.
 - **USING 🤗 TRANSFORMERS** contains general tutorials on how to use the library.
 - **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library.
- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general resarch in
+- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in
  transformers model
 - The three last section contain the documentation of each public class and function, grouped in:

@ -52,8 +74,8 @@ The documentation is organized in five parts:
    - **MODELS** for the classes and functions related to each model implemented in the library.
    - **INTERNAL HELPERS** for the classes and functions we use internally.

-The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and
-conversion utilities for the following models:
+The library currently contains PyTorch, Tensorflow and Flax implementations, pretrained model weights, usage scripts
+and conversion utilities for the following models:

 ..
    This list is updated automatically from the README with `make fix-copies`. Do not update manually!
@ -66,105 +88,225 @@ conversion utilities for the following models:
   Pre-training for Natural Language Generation, Translation, and Comprehension
   <https://arxiv.org/pdf/1910.13461.pdf>`__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman
   Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-3. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
+3. :doc:`BARThez <model_doc/barthez>` (from École polytechnique) released with the paper `BARThez: a Skilled Pretrained
+   French Sequence-to-Sequence Model <https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P.
+   Tixier, Michalis Vazirgiannis.
+4. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
   Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__ by Jacob Devlin, Ming-Wei Chang,
   Kenton Lee and Kristina Toutanova.
-4. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
+5. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
   Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
   Narayan, Aliaksei Severyn.
-5. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
+6. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-6. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
+7. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
+   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
+   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+8. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
+   <https://arxiv.org/abs/2010.10499>`__ by Adrian de Wynter and Daniel J. Perry.
+9. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
   French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
   Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-7. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
-   Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
-   Lav R. Varshney, Caiming Xiong and Richard Socher.
-8. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced
-   BERT with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
-   Weizhu Chen.
-9. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
-   Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe Zhang,
-   Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-10. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+10. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
+    Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou,
+    Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
+11. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+    Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
+    Lav R. Varshney, Caiming Xiong and Richard Socher.
+12. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced
+    BERT with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
+    Weizhu Chen.
+13. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
+    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+14. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-11. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+15. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-12. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+16. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-13. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+17. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-14. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+18. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-15. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+19. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-16. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+20. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-17. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+21. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-18. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+22. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+23. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-19. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+24. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-20. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+25. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-21. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+26. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-22. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+27. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
+    Jianfeng Lu, Tie-Yan Liu.
+28. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-23. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+29. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-24. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+30. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-25. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+31. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-26. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+32. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
-    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. ultilingual BERT into `DistilmBERT
-    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German version of
-    DistilBERT.
-27. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
+    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+33. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
    Krishna, and Kurt W. Keutzer.
-28. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+34. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-29. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+35. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
+    Francesco Piccinno and Julian Martin Eisenschlos.
+36. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-30. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+37. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+    Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
+    Zhou, Abdelrahman Mohamed, Michael Auli.
+38. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-31. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+39. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-32. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+40. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-33. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+41. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-34. `Other community models <https://huggingface.co/models>`__, contributed by the `community
-    <https://huggingface.co/users>`__.
+
+
+.. _bigtable:
+
+The table below represents the current support in the library for each of those models, whether they have a Python
+tokenizer (called "slow"). A "fast" tokenizer backed by the 🤗 Tokenizers library, whether they have support in PyTorch,
+TensorFlow and/or Flax.
+
+..
+    This table is updated automatically from the auto modules with `make fix-copies`. Do not update manually!
+
+.. rst-class:: center-aligned-table
+
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            Model            | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
+=============================+================+================+=================+====================+==============+
+|           ALBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            BART             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            BERT             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       Bert Generation       |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         Blenderbot          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       BlenderbotSmall       |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          CamemBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          ConvBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           DeBERTa           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       Encoder decoder       |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| FairSeq Machine-Translation |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          FlauBERT           |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|     Funnel Transformer      |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             LED             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           LXMERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         MobileBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         OpenAI GPT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        OpenAI GPT-2         |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           Pegasus           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         ProphetNet          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             RAG             |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          Reformer           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          RetriBERT          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           RoBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         SqueezeBERT         |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            TAPAS            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       Transformer-XL        |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          Wav2Vec2           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         XLM-RoBERTa         |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        XLMProphetNet        |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            XLNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             mT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+

 .. toctree::
    :maxdepth: 2
@ -195,9 +337,11 @@ conversion utilities for the following models:
    examples
    custom_datasets
    notebooks
+    community
    converting_tensorflow_models
    migration
    contributing
+    add_new_model
    testing
    serialization

@ -231,10 +375,15 @@ conversion utilities for the following models:
    model_doc/albert
    model_doc/auto
    model_doc/bart
+    model_doc/barthez
    model_doc/bert
+    model_doc/bertweet
    model_doc/bertgeneration
    model_doc/blenderbot
+    model_doc/blenderbot_small
+    model_doc/bort
    model_doc/camembert
+    model_doc/convbert
    model_doc/ctrl
    model_doc/deberta
    model_doc/dialogpt
@ -245,16 +394,20 @@ conversion utilities for the following models:
    model_doc/flaubert
    model_doc/fsmt
    model_doc/funnel
+    model_doc/herbert
    model_doc/layoutlm
+    model_doc/led
    model_doc/longformer
    model_doc/lxmert
    model_doc/marian
    model_doc/mbart
    model_doc/mobilebert
+    model_doc/mpnet
    model_doc/mt5
    model_doc/gpt
    model_doc/gpt2
    model_doc/pegasus
+    model_doc/phobert
    model_doc/prophetnet
    model_doc/rag
    model_doc/reformer
@ -262,7 +415,9 @@ conversion utilities for the following models:
    model_doc/roberta
    model_doc/squeezebert
    model_doc/t5
+    model_doc/tapas
    model_doc/transformerxl
+    model_doc/wav2vec2
    model_doc/xlm
    model_doc/xlmprophetnet
    model_doc/xlmroberta
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@ -1,9 +1,25 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # Installation

 🤗 Transformers is tested on Python 3.6+, and PyTorch 1.1.0+ or TensorFlow 2.0+.

 You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're
-unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going 
+unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going
 to use and activate it.

 Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you
@ -12,9 +28,10 @@ must install it from source.
 ## Installation with pip

 First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) 
-and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific 
-install command for your platform.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available),
+[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or
+[Flax installation page](https://github.com/google/flax#quick-install)
+regarding the specific install command for your platform.

 When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:

@ -34,6 +51,12 @@ or 🤗 Transformers and TensorFlow 2.0 in one line with:
 pip install transformers[tf-cpu]
 ```

+or 🤗 Transformers and Flax in one line with:
+
+```bash
+pip install transformers[flax]
+```
+
 To check 🤗 Transformers is properly installed, run the following command:

 ```bash
@ -50,15 +73,17 @@ It should download a pretrained model then print something like

 ## Installing from source

-To install from source, clone the repository and install with the following commands:
+Here is how to quickly install `transformers` from source:

-``` bash
-git clone https://github.com/huggingface/transformers.git
-cd transformers
-pip install -e .
+```bash
+pip install git+https://github.com/huggingface/transformers
 ```

-Again, you can run 
+Note that this will install not the latest released version, but the bleeding edge `master` version, which you may want to use in case a bug has been fixed since the last official release and a new release hasn't  been yet rolled out.
+
+While we strive to keep `master` operational at all times, if you notice some issues, they usually get fixed within a few hours or a day and and you're more than welcome to help us detect any problems by opening an [Issue](https://github.com/huggingface/transformers/issues) and this way, things will get fixed even sooner.
+
+Again, you can run:

 ```bash
 python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
@ -66,19 +91,59 @@ python -c "from transformers import pipeline; print(pipeline('sentiment-analysis

 to check 🤗 Transformers is properly installed.

+## Editable install
+
+If you want to constantly use the bleeding edge `master` version of the source code, or if you want to contribute to the library and need to test the changes in the code you're making, you will need an editable install. This is done by cloning the repository and installing with the following commands:
+
+``` bash
+git clone https://github.com/huggingface/transformers.git
+cd transformers
+pip install -e .
+```
+
+This command performs a magical link between the folder you cloned the repository to and your python library paths, and it'll look inside this folder in addition to the normal library-wide paths. So if normally your python packages get installed into:
+```
+~/anaconda3/envs/main/lib/python3.7/site-packages/
+```
+now this editable install will reside where you clone the folder to, e.g. `~/transformers/` and python will search it too.
+
+Do note that you have to keep that `transformers` folder around and not delete it to continue using the  `transfomers` library.
+
+Now, let's get to the real benefit of this installation approach. Say, you saw some new feature has been just committed into `master`. If you have already performed all the steps above, to update your transformers to include all the latest commits, all you need to do is to `cd` into that cloned repository folder and update the clone to the latest version:
+
+```
+cd ~/transformers/
+git pull
+```
+
+There is nothing else to do. Your python environment will find the bleeding edge version of `transformers` on the next run.
+
+
+## With conda
+
+Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
+
+🤗 Transformers can be installed using conda as follows:
+
+```
+conda install -c huggingface transformers
+```
+
+Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
+
 ## Caching models

 This library provides pretrained models that will be downloaded and cached locally. Unless you specify a location with
 `cache_dir=...` when you use methods like `from_pretrained`, these models will automatically be downloaded in the
-folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the PyTorch
-cache home followed by ``/transformers/`` (even if you don't have PyTorch installed). This is (by order of priority):
+folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the Hugging
+Face cache home followed by ``/transformers/``. This is (by order of priority):

-  * shell environment variable ``TORCH_HOME``
-  * shell environment variable ``XDG_CACHE_HOME`` + ``/torch/``
-  * default: ``~/.cache/torch/``
+  * shell environment variable ``HF_HOME``
+  * shell environment variable ``XDG_CACHE_HOME`` + ``/huggingface/``
+  * default: ``~/.cache/huggingface/``

 So if you don't have any specific environment variable set, the cache directory will be at
-``~/.cache/torch/transformers/``.
+``~/.cache/huggingface/transformers/``.

 **Note:** If you have set a shell environment variable for one of the predecessors of this library
 (``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
@ -94,9 +159,9 @@ faster, and cheaper. Feel free to contact us privately if you need any help.

 You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.

-It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, 
+It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`,
 `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.

-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch or
+At some point in the future, you'll be able to seamlessly move from pretraining or fine-tuning models in PyTorch or
 TensorFlow 2.0 to productizing them in CoreML, or prototype a model or an app in CoreML then research its
 hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!
--- a/docs/source/internal/generation_utils.rst
+++ b/docs/source/internal/generation_utils.rst
@ -1,12 +1,114 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Utilities for Generation
 -----------------------------------------------------------------------------------------------------------------------

-This page lists all the utility functions used by :meth:`~transformers.PretrainedModel.generate`,
-:meth:`~transformers.PretrainedModel.greedy_search`, :meth:`~transformers.PretrainedModel.sample`,
-:meth:`~transformers.PretrainedModel.beam_search`, and :meth:`~transformers.PretrainedModel.beam_sample`.
+This page lists all the utility functions used by :meth:`~transformers.PreTrainedModel.generate`,
+:meth:`~transformers.PreTrainedModel.greedy_search`, :meth:`~transformers.PreTrainedModel.sample`,
+:meth:`~transformers.PreTrainedModel.beam_search`, :meth:`~transformers.PreTrainedModel.beam_sample`, and
+:meth:`~transformers.PreTrainedModel.group_beam_search`.

 Most of those are only useful if you are studying the code of the generate methods in the library.

+Generate Outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The output of :meth:`~transformers.PreTrainedModel.generate` is an instance of a subclass of
+:class:`~transformers.file_utils.ModelOutput`. This output is a data structure containing all the information returned
+by :meth:`~transformers.PreTrainedModel.generate`, but that can also be used as tuple or dictionary.
+
+Here's an example:
+
+.. code-block::
+
+    from transformers import GPT2Tokenizer, GPT2LMHeadModel
+
+    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+    model = GPT2LMHeadModel.from_pretrained('gpt2')
+
+    inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
+    generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+
+The ``generation_output`` object is a :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`, as we can
+see in the documentation of that class below, it means it has the following attributes:
+
+- ``sequences``: the generated sequences of tokens
+- ``scores`` (optional): the prediction scores of the language modelling head, for each generation step
+- ``hidden_states`` (optional): the hidden states of the model, for each generation step
+- ``attentions`` (optional): the attention weights of the model, for each generation step
+
+Here we have the ``scores`` since we passed along ``output_scores=True``, but we don't have ``hidden_states`` and
+``attentions`` because we didn't pass ``output_hidden_states=True`` or ``output_attentions=True``.
+
+You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
+will get ``None``. Here for instance ``generation_output.scores`` are all the generated prediction scores of the
+language modeling head, and ``generation_output.attentions`` is ``None``.
+
+When using our ``generation_output`` object as a tuple, it only keeps the attributes that don't have ``None`` values.
+Here, for instance, it has two elements, ``loss`` then ``logits``, so
+
+.. code-block::
+
+    generation_output[:2]
+
+will return the tuple ``(generation_output.sequences, generation_output.scores)`` for instance.
+
+When using our ``generation_output`` object as a dictionary, it only keeps the attributes that don't have ``None``
+values. Here, for instance, it has two keys that are ``sequences`` and ``scores``.
+
+We document here all output types.
+
+
+GreedySearchOutput
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: transformers.generation_utils.GreedySearchDecoderOnlyOutput
+    :members:
+
+.. autoclass:: transformers.generation_utils.GreedySearchEncoderDecoderOutput
+    :members:
+
+
+SampleOutput
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: transformers.generation_utils.SampleDecoderOnlyOutput
+    :members:
+
+.. autoclass:: transformers.generation_utils.SampleEncoderDecoderOutput
+    :members:
+
+
+BeamSearchOutput
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: transformers.generation_utils.BeamSearchDecoderOnlyOutput
+    :members:
+
+.. autoclass:: transformers.generation_utils.BeamSearchEncoderDecoderOutput
+    :members:
+
+
+BeamSampleOutput
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: transformers.generation_utils.BeamSampleDecoderOnlyOutput
+    :members:
+
+.. autoclass:: transformers.generation_utils.BeamSampleEncoderDecoderOutput
+    :members:
+
+
 LogitsProcessor
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -19,6 +121,9 @@ generation.
 .. autoclass:: transformers.LogitsProcessorList
    :members: __call__

+.. autoclass:: transformers.LogitsWarper
+    :members: __call__
+
 .. autoclass:: transformers.MinLengthLogitsProcessor
    :members: __call__

@ -40,6 +145,12 @@ generation.
 .. autoclass:: transformers.NoBadWordsLogitsProcessor
    :members: __call__

+.. autoclass:: transformers.PrefixConstrainedLogitsProcessor
+    :members: __call__
+
+.. autoclass:: transformers.HammingDiversityLogitsProcessor
+    :members: __call__
+
 BeamSearch
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -48,3 +159,10 @@ BeamSearch

 .. autoclass:: transformers.BeamSearchScorer
    :members: process, finalize
+
+Utilities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.top_k_top_p_filtering
+
+.. autofunction:: transformers.tf_top_k_top_p_filtering
--- a/docs/source/internal/modeling_utils.rst
+++ b/docs/source/internal/modeling_utils.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Custom Layers and Utilities
 -----------------------------------------------------------------------------------------------------------------------

@ -79,8 +91,6 @@ TensorFlow loss functions
 TensorFlow Helper Functions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autofunction:: transformers.modeling_tf_utils.cast_bool_to_primitive
-
 .. autofunction:: transformers.modeling_tf_utils.get_initializer

 .. autofunction:: transformers.modeling_tf_utils.keras_serializable
--- a/docs/source/internal/pipelines_utils.rst
+++ b/docs/source/internal/pipelines_utils.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Utilities for pipelines
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/internal/tokenization_utils.rst
+++ b/docs/source/internal/tokenization_utils.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Utilities for Tokenizers
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/internal/trainer_utils.rst
+++ b/docs/source/internal/trainer_utils.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Utilities for Trainer
 -----------------------------------------------------------------------------------------------------------------------

@ -10,6 +22,8 @@ Utilities

 .. autoclass:: transformers.EvalPrediction

+.. autoclass:: transformers.EvaluationStrategy
+
 .. autofunction:: transformers.set_seed

 .. autofunction:: transformers.torch_distributed_zero_first
@ -20,8 +34,15 @@ Callbacks internals

 .. autoclass:: transformers.trainer_callback.CallbackHandler

+
 Distributed Evaluation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.trainer_pt_utils.DistributedTensorGatherer
    :members:
+
+
+Distributed Evaluation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.HfArgumentParser
--- a/docs/source/main_classes/callback.rst
+++ b/docs/source/main_classes/callback.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Callbacks
 -----------------------------------------------------------------------------------------------------------------------

@ -44,6 +56,8 @@ Here is the list of the available :class:`~transformers.TrainerCallback` in the

 .. autoclass:: transformers.ProgressCallback

+.. autoclass:: transformers.EarlyStoppingCallback
+
 .. autoclass:: transformers.integrations.TensorBoardCallback

 .. autoclass:: transformers.integrations.WandbCallback
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Configuration
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/main_classes/logging.rst
+++ b/docs/source/main_classes/logging.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Logging
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@ -1,9 +1,22 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Models
 -----------------------------------------------------------------------------------------------------------------------

-The base classes :class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` implement the
-common methods for loading/saving a model either from a local file or directory, or from a pretrained model
-configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
+The base classes :class:`~transformers.PreTrainedModel`, :class:`~transformers.TFPreTrainedModel`, and
+:class:`~transformers.FlaxPreTrainedModel` implement the common methods for loading/saving a model either from a local
+file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS
+S3 repository).

 :class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` also implement a few methods which
 are common among all the models to:
@ -45,6 +58,13 @@ TFModelUtilsMixin
    :members:


+FlaxPreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxPreTrainedModel
+    :members:
+
+
 Generation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Optimization
 -----------------------------------------------------------------------------------------------------------------------

@ -31,6 +43,10 @@ Schedules
 Learning Rate Schedules (Pytorch)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

+.. autoclass:: transformers.SchedulerType
+
+.. autofunction:: transformers.get_scheduler
+
 .. autofunction:: transformers.get_constant_schedule


@ -62,6 +78,10 @@ Learning Rate Schedules (Pytorch)
    :target: /imgs/warmup_linear_schedule.png
    :alt:

+
+.. autofunction:: transformers.get_polynomial_decay_schedule_with_warmup
+
+
 Warmup (TensorFlow)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Model outputs
 -----------------------------------------------------------------------------------------------------------------------

@ -114,13 +126,6 @@ CausalLMOutputWithCrossAttentions
    :members:


-CausalLMOutputWithPastAndCrossAttentions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithPastAndCrossAttentions
-    :members:
-
-
 CausalLMOutputWithPast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Pipelines
 -----------------------------------------------------------------------------------------------------------------------

@ -22,6 +34,7 @@ There are two categories of pipeline abstractions to be aware about:
    - :class:`~transformers.TranslationPipeline`
    - :class:`~transformers.ZeroShotClassificationPipeline`
    - :class:`~transformers.Text2TextGenerationPipeline`
+    - :class:`~transformers.TableQuestionAnsweringPipeline`

 The pipeline abstraction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -61,8 +74,9 @@ FillMaskPipeline
 NerPipeline
 =======================================================================================================================

-This class is an alias of the :class:`~transformers.TokenClassificationPipeline` defined below. Please refer to that
-pipeline for documentation and usage examples.
+.. autoclass:: transformers.NerPipeline
+
+See :class:`~transformers.TokenClassificationPipeline` for all details.

 QuestionAnsweringPipeline
 =======================================================================================================================
@ -78,6 +92,13 @@ SummarizationPipeline
    :special-members: __call__
    :members:

+TableQuestionAnsweringPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.TableQuestionAnsweringPipeline
+    :special-members: __call__
+
+
 TextClassificationPipeline
 =======================================================================================================================

@ -106,6 +127,13 @@ TokenClassificationPipeline
    :special-members: __call__
    :members:

+TranslationPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.TranslationPipeline
+    :special-members: __call__
+    :members:
+
 ZeroShotClassificationPipeline
 =======================================================================================================================

--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Processors
 -----------------------------------------------------------------------------------------------------------------------

@ -156,5 +168,5 @@ Using `tensorflow_datasets` is as easy as using a data file:
    )


-Another example using these processors is given in the `run_squad.py
-<https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_squad.py>`__ script.
+Another example using these processors is given in the :prefix_link:`run_squad.py
+<examples/question-answering/run_squad.py>` script.
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Tokenizer
 -----------------------------------------------------------------------------------------------------------------------

@ -44,6 +56,8 @@ PreTrainedTokenizer
    :special-members: __call__
    :members:

+    .. automethod:: encode
+

 PreTrainedTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -52,6 +66,8 @@ PreTrainedTokenizerFast
    :special-members: __call__
    :members:

+    .. automethod:: encode
+

 BatchEncoding
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@ -1,3 +1,15 @@
+..
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Trainer
 -----------------------------------------------------------------------------------------------------------------------

@ -51,6 +63,13 @@ Trainer
    :members:


+Seq2SeqTrainer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Seq2SeqTrainer
+    :members: evaluate, predict
+
+
 TFTrainer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -65,8 +84,639 @@ TrainingArguments
    :members:


+Seq2SeqTrainingArguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Seq2SeqTrainingArguments
+    :members:
+
+
 TFTrainingArguments
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFTrainingArguments
    :members:
+
+
+Trainer Integrations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+
+The :class:`~transformers.Trainer` has been extended to support libraries that may dramatically improve your training
+time and fit much bigger models.
+
+Currently it supports third party solutions, `DeepSpeed <https://github.com/microsoft/DeepSpeed>`__ and `FairScale
+<https://github.com/facebookresearch/fairscale/>`__, which implement parts of the paper `ZeRO: Memory Optimizations
+Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He
+<https://arxiv.org/abs/1910.02054>`__.
+
+This provided support is new and experimental as of this writing.
+
+Installation Notes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As of this writing, both FairScale and Deepspeed require compilation of CUDA C++ code, before they can be used.
+
+While all installation issues should be dealt with through the corresponding GitHub Issues of `FairScale
+<https://github.com/facebookresearch/fairscale/issues>`__ and `Deepspeed
+<https://github.com/microsoft/DeepSpeed/issues>`__, there are a few common issues that one may encounter while building
+any PyTorch extension that needs to build CUDA extensions.
+
+Therefore, if you encounter a CUDA-related build issue while doing one of the following or both:
+
+.. code-block:: bash
+
+    pip install fairscale
+    pip install deepspeed
+
+please, read the following notes first.
+
+In these notes we give examples for what to do when ``pytorch`` has been built with CUDA ``10.2``. If your situation is
+different remember to adjust the version number to the one you are after.
+
+**Possible problem #1:**
+
+While, Pytorch comes with its own CUDA toolkit, to build these two projects you must have an identical version of CUDA
+installed system-wide.
+
+For example, if you installed ``pytorch`` with ``cudatoolkit==10.2`` in the Python environment, you also need to have
+CUDA ``10.2`` installed system-wide.
+
+The exact location may vary from system to system, but ``/usr/local/cuda-10.2`` is the most common location on many
+Unix systems. When CUDA is correctly set up and added to the ``PATH`` environment variable, one can find the
+installation location by doing:
+
+.. code-block:: bash
+
+    which nvcc
+
+If you don't have CUDA installed system-wide, install it first. You will find the instructions by using your favorite
+search engine. For example, if you're on Ubuntu you may want to search for: `ubuntu cuda 10.2 install
+<https://www.google.com/search?q=ubuntu+cuda+10.2+install>`__.
+
+**Possible problem #2:**
+
+Another possible common problem is that you may have more than one CUDA toolkit installed system-wide. For example you
+may have:
+
+.. code-block:: bash
+
+    /usr/local/cuda-10.2
+    /usr/local/cuda-11.0
+
+Now, in this situation you need to make sure that your ``PATH`` and ``LD_LIBRARY_PATH`` environment variables contain
+the correct paths to the desired CUDA version. Typically, package installers will set these to contain whatever the
+last version was installed. If you encounter the problem, where the package build fails because it can't find the right
+CUDA version despite you having it installed system-wide, it means that you need to adjust the 2 aforementioned
+environment variables.
+
+First, you may look at their contents:
+
+.. code-block:: bash
+
+    echo $PATH
+    echo $LD_LIBRARY_PATH
+
+so you get an idea of what is inside.
+
+It's possible that ``LD_LIBRARY_PATH`` is empty.
+
+``PATH`` lists the locations of where executables can be found and ``LD_LIBRARY_PATH`` is for where shared libraries
+are to looked for. In both cases, earlier entries have priority over the later ones. ``:`` is used to separate multiple
+entries.
+
+Now, to tell the build program where to find the specific CUDA toolkit, insert the desired paths to be listed first by
+doing:
+
+.. code-block:: bash
+
+    export PATH=/usr/local/cuda-10.2/bin:$PATH
+    export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH
+
+Note that we aren't overwriting the existing values, but prepending instead.
+
+Of course, adjust the version number, the full path if need be. Check that the directories you assign actually do
+exist. ``lib64`` sub-directory is where the various CUDA ``.so`` objects, like ``libcudart.so`` reside, it's unlikely
+that your system will have it named differently, but if it is adjust it to reflect your reality.
+
+
+**Possible problem #3:**
+
+Some older CUDA versions may refuse to build with newer compilers. For example, you my have ``gcc-9`` but it wants
+``gcc-7``.
+
+There are various ways to go about it.
+
+If you can install the latest CUDA toolkit it typically should support the newer compiler.
+
+Alternatively, you could install the lower version of the compiler in addition to the one you already have, or you may
+already have it but it's not the default one, so the build system can't see it. If you have ``gcc-7`` installed but the
+build system complains it can't find it, the following might do the trick:
+
+.. code-block:: bash
+
+    sudo ln -s /usr/bin/gcc-7  /usr/local/cuda-10.2/bin/gcc
+    sudo ln -s /usr/bin/g++-7  /usr/local/cuda-10.2/bin/g++
+
+
+Here, we are making a symlink to ``gcc-7`` from ``/usr/local/cuda-10.2/bin/gcc`` and since
+``/usr/local/cuda-10.2/bin/`` should be in the ``PATH`` environment variable (see the previous problem's solution), it
+should find ``gcc-7`` (and ``g++7``) and then the build will succeed.
+
+As always make sure to edit the paths in the example to match your situation.
+
+**If still unsuccessful:**
+
+If after addressing these you still encounter build issues, please, proceed with the GitHub Issue of `FairScale
+<https://github.com/facebookresearch/fairscale/issues>`__ and `Deepspeed
+<https://github.com/microsoft/DeepSpeed/issues>`__, depending on the project you have the problem with.
+
+
+FairScale
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By integrating `FairScale <https://github.com/facebookresearch/fairscale/>`__ the :class:`~transformers.Trainer`
+provides support for the following features from `the ZeRO paper <https://arxiv.org/abs/1910.02054>`__:
+
+1. Optimizer State Sharding
+2. Gradient Sharding
+
+You will need at least two GPUs to use this feature.
+
+To deploy this feature:
+
+1. Install the library via pypi:
+
+   .. code-block:: bash
+
+       pip install fairscale
+
+   or find more details on `the FairScale's GitHub page
+   <https://github.com/facebookresearch/fairscale/#installation>`__.
+
+2. Add ``--sharded_ddp`` to the command line arguments, and make sure you have added the distributed launcher ``-m
+   torch.distributed.launch --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.
+
+For example here is how you could use it for ``finetune_trainer.py`` with 2 GPUs:
+
+.. code-block:: bash
+
+    cd examples/seq2seq
+    python -m torch.distributed.launch --nproc_per_node=2 ./finetune_trainer.py \
+    --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
+    --output_dir output_dir --overwrite_output_dir \
+    --do_train --n_train 500 --num_train_epochs 1 \
+    --per_device_train_batch_size 1  --freeze_embeds \
+    --src_lang en_XX --tgt_lang ro_RO --task translation \
+    --fp16 --sharded_ddp
+
+Notes:
+
+- This feature requires distributed training (so multiple GPUs).
+- It is not implemented for TPUs.
+- It works with ``--fp16`` too, to make things even faster.
+- One of the main benefits of enabling ``--sharded_ddp`` is that it uses a lot less GPU memory, so you should be able
+  to use significantly larger batch sizes using the same hardware (e.g. 3x and even bigger) which should lead to
+  significantly shorter training time.
+
+
+DeepSpeed
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+`DeepSpeed <https://github.com/microsoft/DeepSpeed>`__ implements everything described in the `ZeRO paper
+<https://arxiv.org/abs/1910.02054>`__, except ZeRO's stage 3. "Parameter Partitioning (Pos+g+p)". Currently it provides
+full support for:
+
+1. Optimizer State Partitioning (ZeRO stage 1)
+2. Add Gradient Partitioning (ZeRO stage 2)
+
+Installation
+=======================================================================================================================
+
+Install the library via pypi:
+
+.. code-block:: bash
+
+    pip install deepspeed
+
+or find more details on `the DeepSpeed's GitHub page <https://github.com/microsoft/deepspeed#installation>`__.
+
+Deployment with multiple GPUs
+=======================================================================================================================
+
+To deploy this feature with multiple GPUs adjust the :class:`~transformers.Trainer` command line arguments as
+following:
+
+1. replace ``python -m torch.distributed.launch`` with ``deepspeed``.
+2. add a new argument ``--deepspeed ds_config.json``, where ``ds_config.json`` is the DeepSpeed configuration file as
+   documented `here <https://www.deepspeed.ai/docs/config-json/>`__. The file naming is up to you.
+
+Therefore, if your original command line looked as following:
+
+.. code-block:: bash
+
+    python -m torch.distributed.launch --nproc_per_node=2 your_program.py <normal cl args>
+
+Now it should be:
+
+.. code-block:: bash
+
+    deepspeed --num_gpus=2 your_program.py <normal cl args> --deepspeed ds_config.json
+
+Unlike, ``torch.distributed.launch`` where you have to specify how many GPUs to use with ``--nproc_per_node``, with the
+``deepspeed`` launcher you don't have to use the corresponding ``--num_gpus`` if you want all of your GPUs used. The
+full details on how to configure various nodes and GPUs can be found `here
+<https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node>`__.
+
+Here is an example of running ``finetune_trainer.py`` under DeepSpeed deploying all available GPUs:
+
+.. code-block:: bash
+
+    cd examples/seq2seq
+    deepspeed ./finetune_trainer.py --deepspeed ds_config.json \
+    --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
+    --output_dir output_dir --overwrite_output_dir \
+    --do_train --n_train 500 --num_train_epochs 1 \
+    --per_device_train_batch_size 1  --freeze_embeds \
+    --src_lang en_XX --tgt_lang ro_RO --task translation
+
+Note that in the DeepSpeed documentation you are likely to see ``--deepspeed --deepspeed_config ds_config.json`` - i.e.
+two DeepSpeed-related arguments, but for the sake of simplicity, and since there are already so many arguments to deal
+with, we combined the two into a single argument.
+
+For some practical usage examples, please, see this `post
+<https://github.com/huggingface/transformers/issues/8771#issuecomment-759248400>`__.
+
+
+
+Deployment with one GPU
+=======================================================================================================================
+
+To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` command line arguments as following:
+
+.. code-block:: bash
+
+    cd examples/seq2seq
+    deepspeed --num_gpus=1 ./finetune_trainer.py --deepspeed ds_config.json \
+    --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
+    --output_dir output_dir --overwrite_output_dir \
+    --do_train --n_train 500 --num_train_epochs 1 \
+    --per_device_train_batch_size 1  --freeze_embeds \
+    --src_lang en_XX --tgt_lang ro_RO --task translation
+
+This is almost the same as with multiple-GPUs, but here we tell DeepSpeed explicitly to use just one GPU. By default,
+DeepSpeed deploys all GPUs it can see. If you have only 1 GPU to start with, then you don't need this argument. The
+following `documentation <https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node>`__ discusses the
+launcher options.
+
+Why would you want to use DeepSpeed with just one GPU?
+
+1. It has a ZeRO-offload feature which can delegate some computations and memory to the host's CPU and RAM, and thus
+   leave more GPU resources for model's needs - e.g. larger batch size, or enabling a fitting of a very big model which
+   normally won't fit.
+2. It provides a smart GPU memory management system, that minimizes memory fragmentation, which again allows you to fit
+   bigger models and data batches.
+
+While we are going to discuss the configuration in details next, the key to getting a huge improvement on a single GPU
+with DeepSpeed is to have at least the following configuration in the configuration file:
+
+.. code-block:: json
+
+  {
+    "zero_optimization": {
+       "stage": 2,
+       "allgather_partitions": true,
+       "allgather_bucket_size": 2e8,
+       "reduce_scatter": true,
+       "reduce_bucket_size": 2e8,
+       "overlap_comm": true,
+       "contiguous_gradients": true,
+       "cpu_offload": true
+    },
+  }
+
+which enables ``cpu_offload`` and some other important features. You may experiment with the buffer sizes, you will
+find more details in the discussion below.
+
+For a practical usage example of this type of deployment, please, see this `post
+<https://github.com/huggingface/transformers/issues/8771#issuecomment-759176685>`__.
+
+Configuration
+=======================================================================================================================
+
+For the complete guide to the DeepSpeed configuration options that can be used in its configuration file please refer
+to the `following documentation <https://www.deepspeed.ai/docs/config-json/>`__.
+
+While you always have to supply the DeepSpeed configuration file, you can configure the DeepSpeed integration in
+several ways:
+
+1. Supply most of the configuration inside the file, and just use a few required command line arguments. This is the
+   recommended way as it puts most of the configuration params in one place.
+2. Supply just the ZeRO configuration params inside the file, and configure the rest using the normal
+   :class:`~transformers.Trainer` command line arguments.
+3. Any variation of the first two ways.
+
+To get an idea of what DeepSpeed configuration file looks like, here is one that activates ZeRO stage 2 features,
+enables FP16, uses AdamW optimizer and WarmupLR scheduler:
+
+.. code-block:: json
+
+    {
+        "fp16": {
+            "enabled": true,
+            "loss_scale": 0,
+            "loss_scale_window": 1000,
+            "hysteresis": 2,
+            "min_loss_scale": 1
+        },
+
+       "zero_optimization": {
+           "stage": 2,
+           "allgather_partitions": true,
+           "allgather_bucket_size": 5e8,
+           "overlap_comm": true,
+           "reduce_scatter": true,
+           "reduce_bucket_size": 5e8,
+           "contiguous_gradients": true,
+           "cpu_offload": true
+       },
+
+       "optimizer": {
+         "type": "AdamW",
+         "params": {
+           "lr": 3e-5,
+           "betas": [ 0.8, 0.999 ],
+           "eps": 1e-8,
+           "weight_decay": 3e-7
+         }
+       },
+       "zero_allow_untested_optimizer": true,
+
+       "scheduler": {
+         "type": "WarmupLR",
+         "params": {
+           "warmup_min_lr": 0,
+           "warmup_max_lr": 3e-5,
+           "warmup_num_steps": 500
+         }
+       }
+    }
+
+If you already have a command line that you have been using with :class:`transformers.Trainer` args, you can continue
+using those and the :class:`~transformers.Trainer` will automatically convert them into the corresponding DeepSpeed
+configuration at run time. For example, you could use the following configuration file:
+
+.. code-block:: json
+
+    {
+       "zero_optimization": {
+           "stage": 2,
+           "allgather_partitions": true,
+           "allgather_bucket_size": 5e8,
+           "overlap_comm": true,
+           "reduce_scatter": true,
+           "reduce_bucket_size": 5e8,
+           "contiguous_gradients": true,
+           "cpu_offload": true
+       }
+    }
+
+and the following command line arguments:
+
+.. code-block:: bash
+
+    --learning_rate 3e-5 --warmup_steps 500 --adam_beta1 0.8 --adam_beta2 0.999 --adam_epsilon 1e-8 \
+    --weight_decay 3e-7 --lr_scheduler_type constant_with_warmup --fp16 --fp16_backend amp
+
+to achieve the same configuration as provided by the longer json file in the first example.
+
+When you execute the program, DeepSpeed will log the configuration it received from the :class:`~transformers.Trainer`
+to the console, so you can see exactly what the final configuration was passed to it.
+
+Shared Configuration
+=======================================================================================================================
+
+Some configuration information is required by both the :class:`~transformers.Trainer` and DeepSpeed to function
+correctly, therefore, to prevent conflicting definitions, which could lead to hard to detect errors, we chose to
+configure those via the :class:`~transformers.Trainer` command line arguments.
+
+Therefore, the following DeepSpeed configuration params shouldn't be used with the :class:`~transformers.Trainer`:
+
+* ``train_batch_size``
+* ``train_micro_batch_size_per_gpu``
+* ``gradient_accumulation_steps``
+
+as these will be automatically derived from the run time environment and the following 2 command line arguments:
+
+.. code-block:: bash
+
+    --per_device_train_batch_size 8 --gradient_accumulation_steps 2
+
+which are always required to be supplied.
+
+Of course, you will need to adjust the values in this example to your situation.
+
+
+
+ZeRO
+=======================================================================================================================
+
+The ``zero_optimization`` section of the configuration file is the most important part (`docs
+<https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training>`__), since that is where you define
+which ZeRO stages you want to enable and how to configure them.
+
+.. code-block:: json
+
+    {
+       "zero_optimization": {
+           "stage": 2,
+           "allgather_partitions": true,
+           "allgather_bucket_size": 5e8,
+           "overlap_comm": true,
+           "reduce_scatter": true,
+           "reduce_bucket_size": 5e8,
+           "contiguous_gradients": true,
+           "cpu_offload": true
+       }
+    }
+
+Notes:
+
+- enabling ``cpu_offload`` should reduce GPU RAM usage (it requires ``"stage": 2``)
+- ``"overlap_comm": true`` trades off increased GPU RAM usage to lower all-reduce latency. ``overlap_comm`` uses 4.5x
+  the ``allgather_bucket_size`` and ``reduce_bucket_size`` values. So if they are set to 5e8, this requires a 9GB
+  footprint (``5e8 x 2Bytes x 2 x 4.5``). Therefore, if you have a GPU with 8GB or less RAM, to avoid getting
+  OOM-errors you will need to reduce those parameters to about ``2e8``, which would require 3.6GB.
+
+This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides
+no equivalent command line arguments.
+
+
+
+Optimizer
+=======================================================================================================================
+
+
+DeepSpeed's main optimizers are Adam, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are thus
+recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
+<https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__.
+
+If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will
+automatically set it to ``AdamW`` and will use the supplied values or the defaults for the following command line
+arguments: ``--learning_rate``, ``--adam_beta1``, ``--adam_beta2``, ``--adam_epsilon`` and ``--weight_decay``.
+
+Here is an example of the pre-configured ``optimizer`` entry for AdamW:
+
+.. code-block:: json
+
+    {
+       "zero_allow_untested_optimizer": true,
+       "optimizer": {
+           "type": "AdamW",
+           "params": {
+             "lr": 0.001,
+             "betas": [0.8, 0.999],
+             "eps": 1e-8,
+             "weight_decay": 3e-7
+           }
+         }
+    }
+
+Since AdamW isn't on the list of tested with DeepSpeed/ZeRO optimizers, we have to add
+``zero_allow_untested_optimizer`` flag.
+
+If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and
+make sure to adjust the values. e.g. if use Adam you will want ``weight_decay`` around ``0.01``.
+
+
+Scheduler
+=======================================================================================================================
+
+DeepSpeed supports LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR LR schedulers. The full documentation is `here
+<https://www.deepspeed.ai/docs/config-json/#scheduler-parameters>`__.
+
+If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use
+the value of ``--lr_scheduler_type`` to configure it. Currently the :class:`~transformers.Trainer` supports only 2 LR
+schedulers that are also supported by DeepSpeed:
+
+* ``WarmupLR`` via ``--lr_scheduler_type constant_with_warmup``
+* ``WarmupDecayLR`` via ``--lr_scheduler_type linear``. This is also the default value for ``--lr_scheduler_type``,
+  therefore, if you don't configure the scheduler this is scheduler that will get configured by default.
+
+In either case, the values of ``--learning_rate`` and ``--warmup_steps`` will be used for the configuration.
+
+In other words, if you don't use the configuration file to set the ``scheduler`` entry, provide either:
+
+.. code-block:: bash
+
+    --lr_scheduler_type constant_with_warmup --learning_rate 3e-5 --warmup_steps 500
+
+or
+
+.. code-block:: bash
+
+    --lr_scheduler_type linear --learning_rate 3e-5 --warmup_steps 500
+
+with the desired values. If you don't pass these arguments, reasonable default values will be used instead.
+
+In the case of WarmupDecayLR ``total_num_steps`` gets set either via the ``--max_steps`` command line argument, or if
+it is not provided, derived automatically at run time based on the environment and the size of the dataset and other
+command line arguments.
+
+Here is an example of the pre-configured ``scheduler`` entry for WarmupLR (``constant_with_warmup`` in the
+:class:`~transformers.Trainer` API):
+
+.. code-block:: json
+
+    {
+       "scheduler": {
+             "type": "WarmupLR",
+             "params": {
+                 "warmup_min_lr": 0,
+                 "warmup_max_lr": 0.001,
+                 "warmup_num_steps": 1000
+             }
+         }
+    }
+
+Automatic Mixed Precision
+=======================================================================================================================
+
+You can work with FP16 in one of the following ways:
+
+1. Pytorch native amp, as documented `here <https://www.deepspeed.ai/docs/config-json/#fp16-training-options>`__.
+2. NVIDIA's apex, as documented `here
+   <https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options>`__.
+
+If you want to use an equivalent of the Pytorch native amp, you can either configure the ``fp16`` entry in the
+configuration file, or use the following command line arguments: ``--fp16 --fp16_backend amp``.
+
+Here is an example of the ``fp16`` configuration:
+
+.. code-block:: json
+
+    {
+        "fp16": {
+            "enabled": true,
+            "loss_scale": 0,
+            "loss_scale_window": 1000,
+            "hysteresis": 2,
+            "min_loss_scale": 1
+        },
+    }
+
+If you want to use NVIDIA's apex instead, you can can either configure the ``amp`` entry in the configuration file, or
+use the following command line arguments: ``--fp16 --fp16_backend apex --fp16_opt_level 01``.
+
+Here is an example of the ``amp`` configuration:
+
+.. code-block:: json
+
+    {
+        "amp": {
+            "enabled": true,
+            "opt_level": "O1"
+        }
+    }
+
+
+
+Gradient Clipping
+=======================================================================================================================
+
+If you don't configure the ``gradient_clipping`` entry in the configuration file, the :class:`~transformers.Trainer`
+will use the value of the ``--max_grad_norm`` command line argument to set it.
+
+Here is an example of the ``gradient_clipping`` configuration:
+
+.. code-block:: json
+
+    {
+        "gradient_clipping": 1.0,
+    }
+
+
+
+Notes
+=======================================================================================================================
+
+* DeepSpeed works with the PyTorch :class:`~transformers.Trainer` but not TF :class:`~transformers.TFTrainer`.
+* While DeepSpeed has a pip installable PyPI package, it is highly recommended that it gets installed from `source
+  <https://github.com/microsoft/deepspeed#installation>`__ to best match your hardware and also if you need to enable
+  certain features, like 1-bit Adam, which aren't available in the pypi distribution.
+* You don't have to use the :class:`~transformers.Trainer` to use DeepSpeed with HuggingFace ``transformers`` - you can
+  use any model with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration
+  instructions <https://www.deepspeed.ai/getting-started/#writing-deepspeed-models>`__.
+
+Main DeepSpeed Resources
+=======================================================================================================================
+
+- `Project's github <https://github.com/microsoft/deepspeed>`__
+- `Usage docs <https://www.deepspeed.ai/getting-started/>`__
+- `API docs <https://deepspeed.readthedocs.io/en/latest/index.html>`__
+- `Blog posts <https://www.microsoft.com/en-us/research/search/?q=deepspeed>`__
+
+Finally, please, remember that, HuggingFace :class:`~transformers.Trainer` only integrates DeepSpeed, therefore if you
+have any problems or questions with regards to DeepSpeed usage, please, file an issue with `DeepSpeed GitHub
+<https://github.com/microsoft/DeepSpeed/issues>`__.
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@ -1,5 +1,186 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # Migrating from previous packages

+## Migrating from transformers `v3.x` to `v4.x`
+
+A couple of changes were introduced when the switch from version 3 to version 4 was done. Below is a summary of the
+expected changes:
+
+#### 1. AutoTokenizers and pipelines now use fast (rust) tokenizers by default.
+
+The python and rust tokenizers have roughly the same API, but the rust tokenizers have a more complete feature set. 
+
+This introduces two breaking changes:
+- The handling of overflowing tokens between the python and rust tokenizers is different.
+- The rust tokenizers do not accept integers in the encoding methods.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+- The pipelines now contain additional features out of the box. See the [token-classification pipeline with the `grouped_entities` flag](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=textclassification#tokenclassificationpipeline).
+- The auto-tokenizers now return rust tokenizers. In order to obtain the python tokenizers instead, the user may use the `use_fast` flag by setting it to `False`:
+
+In version `v3.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+```
+to obtain the same in version `v4.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+```
+
+#### 2. SentencePiece is removed from the required dependencies
+
+The requirement on the SentencePiece dependency has been lifted from the `setup.py`. This is done so that we may have a channel on anaconda cloud without relying on `conda-forge`. This means that the tokenizers that depend on the SentencePiece library will not be available with a standard `transformers` installation.
+
+This includes the **slow** versions of:
+- `XLNetTokenizer`
+- `AlbertTokenizer`
+- `CamembertTokenizer`
+- `MBartTokenizer`
+- `PegasusTokenizer`
+- `T5Tokenizer`
+- `ReformerTokenizer`
+- `XLMRobertaTokenizer`
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should install `sentencepiece` additionally:
+
+In version `v3.x`:
+```bash
+pip install transformers
+```
+to obtain the same in version `v4.x`:
+```bash
+pip install transformers[sentencepiece]
+```
+or
+```bash
+pip install transformers sentencepiece
+```
+#### 3. The architecture of the repo has been updated so that each model resides in its folder
+
+The past and foreseeable addition of new models means that the number of files in the directory `src/transformers` keeps growing and becomes harder to navigate and understand. We made the choice to put each model and the files accompanying it in their own sub-directories.
+
+This is a breaking change as importing intermediary layers using a model's module directly needs to be done via a different path.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should update the path used to access the layers. 
+
+In version `v3.x`:
+```bash
+from transformers.modeling_bert import BertLayer
+```
+to obtain the same in version `v4.x`:
+```bash
+from transformers.models.bert.modeling_bert import BertLayer
+```
+
+#### 4. Switching the `return_dict` argument to `True` by default
+
+The [`return_dict` argument](https://huggingface.co/transformers/main_classes/output.html) enables the return of dict-like python objects containing the model outputs, instead of the standard tuples. This object is self-documented as keys can be used to retrieve values, while also behaving as a tuple as users may retrieve objects by index or by slice.
+
+This is a breaking change as the limitation of that tuple is that it cannot be unpacked: `value0, value1 = outputs` will not work.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should specify the `return_dict` argument to `False`, either in the model configuration or during the forward pass.
+
+In version `v3.x`:
+```bash
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs)
+```
+to obtain the same in version `v4.x`:
+```bash
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs, return_dict=False)
+```
+or
+```bash
+model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
+outputs = model(**inputs)
+```
+
+#### 5. Removed some deprecated attributes
+
+Attributes that were deprecated have been removed if they had been deprecated for at least a month. The full list of deprecated attributes can be found in [#8604](https://github.com/huggingface/transformers/pull/8604).
+
+Here is a list of these attributes/methods/arguments and what their replacements should be:
+
+In several models, the labels become consistent with the other models:
+- `masked_lm_labels` becomes `labels` in `AlbertForMaskedLM` and `AlbertForPreTraining`.
+- `masked_lm_labels` becomes `labels` in `BertForMaskedLM` and `BertForPreTraining`.
+- `masked_lm_labels` becomes `labels` in `DistilBertForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `ElectraForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `LongformerForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `MobileBertForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `RobertaForMaskedLM`.
+- `lm_labels` becomes `labels` in `BartForConditionalGeneration`.
+- `lm_labels` becomes `labels` in `GPT2DoubleHeadsModel`.
+- `lm_labels` becomes `labels` in `OpenAIGPTDoubleHeadsModel`.
+- `lm_labels` becomes `labels` in `T5ForConditionalGeneration`.
+
+In several models, the caching mechanism becomes consistent with the other models:
+- `decoder_cached_states` becomes `past_key_values` in all BART-like, FSMT and T5 models.
+- `decoder_past_key_values` becomes `past_key_values` in all BART-like, FSMT and T5 models.
+- `past` becomes `past_key_values` in all CTRL models.
+- `past` becomes `past_key_values` in all GPT-2 models.
+
+Regarding the tokenizer classes:
+- The tokenizer attribute `max_len` becomes `model_max_length`.
+- The tokenizer attribute `return_lengths` becomes `return_length`.
+- The tokenizer encoding argument `is_pretokenized` becomes `is_split_into_words`.
+
+Regarding the `Trainer` class:
+- The `Trainer` argument `tb_writer` is removed in favor of the callback `TensorBoardCallback(tb_writer=...)`.
+- The `Trainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
+- The `Trainer` attribute `data_collator` should be a callable.
+- The `Trainer` method `_log` is deprecated in favor of `log`.
+- The `Trainer` method `_training_step` is deprecated in favor of `training_step`.
+- The `Trainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
+- The `Trainer` method `is_local_master` is deprecated in favor of `is_local_process_zero`.
+- The `Trainer` method `is_world_master` is deprecated in favor of `is_world_process_zero`.
+
+Regarding the `TFTrainer` class:
+- The `TFTrainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
+- The `Trainer` method `_log` is deprecated in favor of `log`.
+- The `TFTrainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
+- The `TFTrainer` method `_setup_wandb` is deprecated in favor of `setup_wandb`.
+- The `TFTrainer` method `_run_model` is deprecated in favor of `run_model`.
+
+Regarding the `TrainerArgument` class:
+- The `TrainerArgument` argument `evaluate_during_training` is deprecated in favor of `evaluation_strategy`.
+
+Regarding the Transfo-XL model:
+- The Transfo-XL configuration attribute `tie_weight` becomes `tie_words_embeddings`.
+- The Transfo-XL modeling method `reset_length` becomes `reset_memory_length`.
+
+Regarding pipelines:
+- The `FillMaskPipeline` argument `topk` becomes `top_k`.
+
+
+
 ## Migrating from pytorch-transformers to 🤗 Transformers

 Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to 🤗 Transformers.
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 ALBERT
 -----------------------------------------------------------------------------------------------------------------------

@ -48,6 +60,13 @@ AlbertTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+AlbertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertTokenizerFast
+    :members:
+
+
 Albert specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Auto Classes
 -----------------------------------------------------------------------------------------------------------------------

@ -102,6 +114,13 @@ AutoModelForQuestionAnswering
    :members:


+AutoModelForTableQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForTableQuestionAnswering
+    :members:
+
+
 TFAutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -163,3 +182,10 @@ TFAutoModelForQuestionAnswering

 .. autoclass:: transformers.TFAutoModelForQuestionAnswering
    :members:
+
+
+FlaxAutoModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModel
+    :members:
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BART
 -----------------------------------------------------------------------------------------------------------------------

@ -30,7 +42,7 @@ Examples
 _______________________________________________________________________________________________________________________

 - Examples and scripts for fine-tuning BART and other models for sequence to sequence tasks can be found in
-  `examples/seq2seq/ <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/README.md>`__.
+  :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
 - An example of how to train :class:`~transformers.BartForConditionalGeneration` with a Hugging Face :obj:`datasets`
  object can be found in this `forum discussion
  <https://discuss.huggingface.co/t/train-bart-for-conditional-generation-e-g-summarization/1904>`__.
@ -43,9 +55,8 @@ Implementation Notes

 - Bart doesn't use :obj:`token_type_ids` for sequence classification. Use :class:`~transformers.BartTokenizer` or
  :meth:`~transformers.BartTokenizer.encode` to get the proper splitting.
- The forward pass of :class:`~transformers.BartModel` will create decoder inputs (using the helper function
-  :func:`transformers.models.bart.modeling_bart._prepare_bart_decoder_inputs`) if they are not passed. This is
-  different than some other modeling APIs.
+- The forward pass of :class:`~transformers.BartModel` will create the ``decoder_input_ids`` if they are not passed.
+  This is different than some other modeling APIs. A typical use case of this feature is mask filling.
 - Model predictions are intended to be identical to the original implementation when
  :obj:`force_bos_token_to_be_generated=True`. This only works, however, if the string you pass to
  :func:`fairseq.encode` starts with a space.
@ -53,7 +64,6 @@ Implementation Notes
  summarization, see the example in that docstrings.
 - Models that load the `facebook/bart-large-cnn` weights will not have a :obj:`mask_token_id`, or be able to perform
  mask-filling tasks.
- For training/forward passes that don't involve beam search, pass :obj:`use_cache=False`.

 Mask Filling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -86,6 +96,12 @@ BartTokenizer
    :members:


+BartTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartTokenizerFast
+    :members:
+

 BartModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -93,8 +109,6 @@ BartModel
 .. autoclass:: transformers.BartModel
    :members: forward

-.. autofunction:: transformers.models.bart.modeling_bart._prepare_bart_decoder_inputs
-

 BartForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -116,6 +130,12 @@ BartForQuestionAnswering
 .. autoclass:: transformers.BartForQuestionAnswering
    :members: forward

+BartForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartForCausalLM
+    :members: forward
+


 TFBartModel
--- a/docs/source/model_doc/barthez.rst
+++ b/docs/source/model_doc/barthez.rst
@ -0,0 +1,59 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BARThez
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BARThez model was proposed in `BARThez: a Skilled Pretrained French Sequence-to-Sequence Model`
+<https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis on 23 Oct,
+2020.
+
+The abstract of the paper:
+
+
+*Inductive transfer learning, enabled by self-supervised learning, have taken the entire Natural Language Processing
+(NLP) field by storm, with models such as BERT and BART setting new state of the art on countless natural language
+understanding tasks. While there are some notable exceptions, most of the available models and research have been
+conducted for the English language. In this work, we introduce BARThez, the first BART model for the French language
+(to the best of our knowledge). BARThez was pretrained on a very large monolingual French corpus from past research
+that we adapted to suit BART's perturbation schemes. Unlike already existing BERT-based French language models such as
+CamemBERT and FlauBERT, BARThez is particularly well-suited for generative tasks, since not only its encoder but also
+its decoder is pretrained. In addition to discriminative tasks from the FLUE benchmark, we evaluate BARThez on a novel
+summarization dataset, OrangeSum, that we release with this paper. We also continue the pretraining of an already
+pretrained multilingual BART on BARThez's corpus, and we show that the resulting model, which we call mBARTHez,
+provides a significant boost over vanilla BARThez, and is on par with or outperforms CamemBERT and FlauBERT.*
+
+The Authors' code can be found `here <https://github.com/moussaKam/BARThez>`__.
+
+
+Examples
+_______________________________________________________________________________________________________________________
+
+- BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check:
+  :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+
+
+BarthezTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BarthezTokenizer
+    :members:
+
+
+BarthezTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BarthezTokenizerFast
+    :members:
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BERT
 -----------------------------------------------------------------------------------------------------------------------

@ -195,3 +207,10 @@ FlaxBertModel

 .. autoclass:: transformers.FlaxBertModel
    :members: __call__
+
+
+FlaxBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertForMaskedLM
+    :members: __call__
--- a/docs/source/model_doc/bertgeneration.rst
+++ b/docs/source/model_doc/bertgeneration.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BertGeneration
 -----------------------------------------------------------------------------------------------------------------------

@ -10,7 +22,7 @@ Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi Narayan, Ali

 The abstract from the paper is the following:

-*Unsupervised pre-training of large neural models has recently revolutionized Natural Language Processing. By
+*Unsupervised pretraining of large neural models has recently revolutionized Natural Language Processing. By
 warm-starting from the publicly released checkpoints, NLP practitioners have pushed the state-of-the-art on multiple
 benchmarks while saving significant amounts of compute time. So far the focus has been mainly on the Natural Language
 Understanding tasks. In this paper, we demonstrate the efficacy of pre-trained checkpoints for Sequence Generation. We
--- a/docs/source/model_doc/bertweet.rst
+++ b/docs/source/model_doc/bertweet.rst
@ -0,0 +1,64 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Bertweet
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BERTweet model was proposed in `BERTweet: A pre-trained language model for English Tweets
+<https://www.aclweb.org/anthology/2020.emnlp-demos.2.pdf>`__ by Dat Quoc Nguyen, Thanh Vu, Anh Tuan Nguyen.
+
+The abstract from the paper is the following:
+
+*We present BERTweet, the first public large-scale pre-trained language model for English Tweets. Our BERTweet, having
+the same architecture as BERT-base (Devlin et al., 2019), is trained using the RoBERTa pre-training procedure (Liu et
+al., 2019). Experiments show that BERTweet outperforms strong baselines RoBERTa-base and XLM-R-base (Conneau et al.,
+2020), producing better performance results than the previous state-of-the-art models on three Tweet NLP tasks:
+Part-of-speech tagging, Named-entity recognition and text classification.*
+
+Example of use:
+
+.. code-block::
+
+  import torch
+  from transformers import AutoModel, AutoTokenizer 
+
+  bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
+
+  # For transformers v4.x+: 
+  tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
+
+  # For transformers v3.x: 
+  # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
+
+  # INPUT TWEET IS ALREADY NORMALIZED!
+  line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"
+
+  input_ids = torch.tensor([tokenizer.encode(line)])
+
+  with torch.no_grad():
+      features = bertweet(input_ids)  # Models outputs are now tuples
+
+  ## With TensorFlow 2.0+:
+  # from transformers import TFAutoModel
+  # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
+
+
+The original code can be found `here <https://github.com/VinAIResearch/BERTweet>`__.
+
+BertweetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertweetTokenizer
+    :members: 
--- a/docs/source/model_doc/blenderbot.rst
+++ b/docs/source/model_doc/blenderbot.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Blenderbot
 -----------------------------------------------------------------------------------------------------------------------

@ -31,13 +43,10 @@ Implementation Notes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 - Blenderbot uses a standard `seq2seq model transformer <https://arxiv.org/pdf/1706.03762.pdf>`__ based architecture.
- It inherits completely from :class:`~transformers.BartForConditionalGeneration`
- Even though blenderbot is one model, it uses two tokenizers :class:`~transformers.BlenderbotSmallTokenizer` for 90M
-  checkpoint and :class:`~transformers.BlenderbotTokenizer` for all other checkpoints.
- :class:`~transformers.BlenderbotSmallTokenizer` will always return :class:`~transformers.BlenderbotSmallTokenizer`,
-  regardless of checkpoint. To use the 3B parameter checkpoint, you must call
-  :class:`~transformers.BlenderbotTokenizer` directly.
 - Available checkpoints can be found in the `model hub <https://huggingface.co/models?search=blenderbot>`__.
+- This is the `default` Blenderbot model class. However, some smaller checkpoints, such as
+  ``facebook/blenderbot_small_90M``, have a different architecture and consequently should be used with
+  `BlenderbotSmall <https://huggingface.co/transformers/master/model_doc/blenderbot_small.html>`__.


 Usage
@ -47,26 +56,15 @@ Here is an example of model usage:

 .. code-block::

-        >>> from transformers import BlenderbotSmallTokenizer, BlenderbotForConditionalGeneration
-        >>> mname = 'facebook/blenderbot-90M'
+        >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
+        >>> mname = 'facebook/blenderbot-400M-distill'
        >>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
-        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname)
+        >>> tokenizer = BlenderbotTokenizer.from_pretrained(mname)
        >>> UTTERANCE = "My friends are cool but they eat too many carbs."
        >>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
        >>> reply_ids = model.generate(**inputs)
-        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in reply_ids])
-
-
-Here is how you can check out config values:
-
-.. code-block::
-
-
-        >>> from transformers import BlenderbotConfig
-        >>> config_90 = BlenderbotConfig.from_pretrained("facebook/blenderbot-90M")
-        >>> config_90.to_diff_dict()  # show interesting Values.
-        >>> configuration_3B = BlenderbotConfig("facebook/blenderbot-3B")
-        >>> configuration_3B.to_diff_dict()
+        >>> print(tokenizer.batch_decode(reply_ids))
+        ["<s> That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?</s>"]


 BlenderbotConfig
@ -81,11 +79,14 @@ BlenderbotTokenizer
 .. autoclass:: transformers.BlenderbotTokenizer
    :members: build_inputs_with_special_tokens

-BlenderbotSmallTokenizer
+
+BlenderbotModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.BlenderbotSmallTokenizer
-    :members:
+See :obj:`transformers.BartModel` for arguments to `forward` and `generate`
+
+.. autoclass:: transformers.BlenderbotModel
+    :members: forward


 BlenderbotForConditionalGeneration
@ -94,13 +95,25 @@ BlenderbotForConditionalGeneration
 See :obj:`transformers.BartForConditionalGeneration` for arguments to `forward` and `generate`

 .. autoclass:: transformers.BlenderbotForConditionalGeneration
-    :members:
+    :members: forward
+
+
+BlenderbotForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotForCausalLM
+    :members: forward
+
+
+TFBlenderbotModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBlenderbotModel
+    :members: call


 TFBlenderbotForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-See :obj:`transformers.TFBartForConditionalGeneration` for arguments to `forward` and `generate`
-
 .. autoclass:: transformers.TFBlenderbotForConditionalGeneration
-    :members:
+    :members: call
--- a/docs/source/model_doc/blenderbot_small.rst
+++ b/docs/source/model_doc/blenderbot_small.rst
@ -0,0 +1,91 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Blenderbot Small
+-----------------------------------------------------------------------------------------------------------------------
+
+Note that :class:`~transformers.BlenderbotSmallModel` and
+:class:`~transformers.BlenderbotSmallForConditionalGeneration` are only used in combination with the checkpoint
+`facebook/blenderbot-90M <https://huggingface.co/facebook/blenderbot-90M>`__. Larger Blenderbot checkpoints should
+instead be used with :class:`~transformers.BlenderbotModel` and
+:class:`~transformers.BlenderbotForConditionalGeneration`
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Blender chatbot model was proposed in `Recipes for building an open-domain chatbot
+<https://arxiv.org/pdf/2004.13637.pdf>`__ Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu,
+Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston on 30 Apr 2020.
+
+The abstract of the paper is the following:
+
+*Building open-domain chatbots is a challenging area for machine learning research. While prior work has shown that
+scaling neural models in the number of parameters and the size of the data they are trained on gives improved results,
+we show that other ingredients are important for a high-performing chatbot. Good conversation requires a number of
+skills that an expert conversationalist blends in a seamless way: providing engaging talking points and listening to
+their partners, and displaying knowledge, empathy and personality appropriately, while maintaining a consistent
+persona. We show that large scale models can learn these skills when given appropriate training data and choice of
+generation strategy. We build variants of these recipes with 90M, 2.7B and 9.4B parameter models, and make our models
+and code publicly available. Human evaluations show our best models are superior to existing approaches in multi-turn
+dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing
+failure cases of our models.*
+
+The authors' code can be found `here <https://github.com/facebookresearch/ParlAI>`__ .
+
+BlenderbotSmallConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallConfig
+    :members:
+
+
+BlenderbotSmallTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+BlenderbotSmallModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallModel
+    :members: forward
+
+
+BlenderbotSmallForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallForConditionalGeneration
+    :members: forward
+
+
+BlenderbotSmallForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallForCausalLM
+    :members: forward
+
+
+TFBlenderbotSmallModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBlenderbotSmallModel
+    :members: call
+
+
+TFBlenderbotSmallForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBlenderbotSmallForConditionalGeneration
+    :members: call
--- a/docs/source/model_doc/bort.rst
+++ b/docs/source/model_doc/bort.rst
@ -0,0 +1,46 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BORT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BORT model was proposed in `Optimal Subarchitecture Extraction for BERT <https://arxiv.org/abs/2010.10499>`__ by
+Adrian de Wynter and Daniel J. Perry. It is an optimal subset of architectural parameters for the BERT, which the
+authors refer to as "Bort".
+
+The abstract from the paper is the following:
+
+*We extract an optimal subset of architectural parameters for the BERT architecture from Devlin et al. (2018) by
+applying recent breakthroughs in algorithms for neural architecture search. This optimal subset, which we refer to as
+"Bort", is demonstrably smaller, having an effective (that is, not counting the embedding layer) size of 5.5% the
+original BERT-large architecture, and 16% of the net size. Bort is also able to be pretrained in 288 GPU hours, which
+is 1.2% of the time required to pretrain the highest-performing BERT parametric architectural variant, RoBERTa-large
+(Liu et al., 2019), and about 33% of that of the world-record, in GPU hours, required to train BERT-large on the same
+hardware. It is also 7.9x faster on a CPU, as well as being better performing than other compressed variants of the
+architecture, and some of the non-compressed variants: it obtains performance improvements of between 0.3% and 31%,
+absolute, with respect to BERT-large, on multiple public natural language understanding (NLU) benchmarks.*
+
+Tips:
+
+- BORT's model architecture is based on BERT, so one can refer to :doc:`BERT's documentation page <bert>` for the
+  model's API as well as usage examples.
+- BORT uses the RoBERTa tokenizer instead of the BERT tokenizer, so one can refer to :doc:`RoBERTa's documentation page
+  <roberta>` for the tokenizer's API as well as usage examples.
+- BORT requires a specific fine-tuning algorithm, called `Agora
+  <https://adewynter.github.io/notes/bort_algorithms_and_applications.html#fine-tuning-with-algebraic-topology>`__ ,
+  that is sadly not open-sourced yet. It would be very useful for the community, if someone tries to implement the
+  algorithm to make BORT fine-tuning work.
+
+The original code can be found `here <https://github.com/alexa/bort/>`__.
--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 CamemBERT
 -----------------------------------------------------------------------------------------------------------------------

@ -42,6 +54,13 @@ CamembertTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+CamembertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertTokenizerFast
+    :members:
+
+
 CamembertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/convbert.rst
+++ b/docs/source/model_doc/convbert.rst
@ -0,0 +1,144 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+ConvBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ConvBERT model was proposed in `ConvBERT: Improving BERT with Span-based Dynamic Convolution
+<https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng
+Yan.
+
+The abstract from the paper is the following:
+
+*Pre-trained language models like BERT and its variants have recently achieved impressive performance in various
+natural language understanding tasks. However, BERT heavily relies on the global self-attention block and thus suffers
+large memory footprint and computation cost. Although all its attention heads query on the whole input sequence for
+generating the attention map from a global perspective, we observe some heads only need to learn local dependencies,
+which means the existence of computation redundancy. We therefore propose a novel span-based dynamic convolution to
+replace these self-attention heads to directly model local dependencies. The novel convolution heads, together with the
+rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context
+learning. We equip BERT with this mixed attention design and build a ConvBERT model. Experiments have shown that
+ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and
+fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while
+using less than 1/4 training cost. Code and pre-trained models will be released.*
+
+ConvBERT training tips are similar to those of BERT. The original implementation can be found here:
+https://github.com/yitu-opensource/ConvBert
+
+ConvBertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ConvBertConfig
+    :members:
+
+
+ConvBertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ConvBertTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+ConvBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ConvBertTokenizerFast
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+ConvBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ConvBertModel
+    :members: forward
+
+
+ConvBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ConvBertForMaskedLM
+    :members: forward
+
+
+ConvBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ConvBertForSequenceClassification
+    :members: forward
+
+
+ConvBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ConvBertForMultipleChoice
+    :members: forward
+
+
+ConvBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ConvBertForTokenClassification
+    :members: forward
+
+
+ConvBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ConvBertForQuestionAnswering
+    :members: forward
+
+
+TFConvBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFConvBertModel
+    :members: call
+
+
+TFConvBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFConvBertForMaskedLM
+    :members: call
+
+
+TFConvBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFConvBertForSequenceClassification
+    :members: call
+
+
+TFConvBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFConvBertForMultipleChoice
+    :members: call
+
+
+TFConvBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFConvBertForTokenClassification
+    :members: call
+
+
+TFConvBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFConvBertForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 CTRL
 -----------------------------------------------------------------------------------------------------------------------

@ -65,6 +77,13 @@ CTRLLMHeadModel
    :members: forward


+CTRLForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CTRLForSequenceClassification
+    :members: forward
+
+
 TFCTRLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -78,3 +97,8 @@ TFCTRLLMHeadModel
 .. autoclass:: transformers.TFCTRLLMHeadModel
    :members: call

+TFCTRLForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCTRLForSequenceClassification
+    :members: call
--- a/docs/source/model_doc/deberta.rst
+++ b/docs/source/model_doc/deberta.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DeBERTa
 -----------------------------------------------------------------------------------------------------------------------

@ -20,8 +32,8 @@ disentangled attention mechanism, where each word is represented using two vecto
 position, respectively, and the attention weights among words are computed using disentangled matrices on their
 contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
 predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
-of model pre-training and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half
-of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
+of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
+the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
 (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
 pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*

@ -58,8 +70,29 @@ DebertaPreTrainedModel
    :members:


+DebertaForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaForMaskedLM
+    :members:
+
+
 DebertaForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaForSequenceClassification
    :members:
+
+
+DebertaForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaForTokenClassification
+    :members:
+
+
+DebertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/dialogpt.rst
+++ b/docs/source/model_doc/dialogpt.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DialoGPT
 -----------------------------------------------------------------------------------------------------------------------

@ -36,7 +48,6 @@ modeling. We first concatenate all dialog turns within a dialogue session into a
 sequence length), ended by the end-of-text token.* For more information please confer to the original paper.


-DialoGPT's architecture is based on the GPT2 model, so one can refer to GPT2's `docstring
-<https://huggingface.co/transformers/model_doc/gpt2.html>`_.
+DialoGPT's architecture is based on the GPT2 model, so one can refer to :doc:`GPT2's documentation page <gpt2>`.

 The original code can be found `here <https://github.com/microsoft/DialoGPT>`_.
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DistilBERT
 -----------------------------------------------------------------------------------------------------------------------

@ -18,9 +30,9 @@ operating these large models in on-the-edge and/or under constrained computation
 remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
 model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
 counterparts. While most prior work investigated the use of distillation for building task-specific models, we leverage
-knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a BERT model by
+knowledge distillation during the pretraining phase and show that it is possible to reduce the size of a BERT model by
 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage the inductive
-biases learned by larger models during pre-training, we introduce a triple loss combining language modeling,
+biases learned by larger models during pretraining, we introduce a triple loss combining language modeling,
 distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train and we
 demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative on-device
 study.*
--- a/docs/source/model_doc/dpr.rst
+++ b/docs/source/model_doc/dpr.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DPR
 -----------------------------------------------------------------------------------------------------------------------

@ -5,7 +17,7 @@ Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain Q&A research. It was
-intorduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__ by
+introduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__ by
 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih.

 The abstract from the paper is the following:
--- a/docs/source/model_doc/electra.rst
+++ b/docs/source/model_doc/electra.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 ELECTRA
 -----------------------------------------------------------------------------------------------------------------------

@ -12,14 +24,14 @@ identify which tokens were replaced by the generator in the sequence.

 The abstract from the paper is the following:

-*Masked language modeling (MLM) pre-training methods such as BERT corrupt the input by replacing some tokens with
-[MASK] and then train a model to reconstruct the original tokens. While they produce good results when transferred to
+*Masked language modeling (MLM) pretraining methods such as BERT corrupt the input by replacing some tokens with [MASK]
+and then train a model to reconstruct the original tokens. While they produce good results when transferred to
 downstream NLP tasks, they generally require large amounts of compute to be effective. As an alternative, we propose a
-more sample-efficient pre-training task called replaced token detection. Instead of masking the input, our approach
+more sample-efficient pretraining task called replaced token detection. Instead of masking the input, our approach
 corrupts it by replacing some tokens with plausible alternatives sampled from a small generator network. Then, instead
 of training a model that predicts the original identities of the corrupted tokens, we train a discriminative model that
 predicts whether each token in the corrupted input was replaced by a generator sample or not. Thorough experiments
-demonstrate this new pre-training task is more efficient than MLM because the task is defined over all input tokens
+demonstrate this new pretraining task is more efficient than MLM because the task is defined over all input tokens
 rather than just the small subset that was masked out. As a result, the contextual representations learned by our
 approach substantially outperform the ones learned by BERT given the same model size, data, and compute. The gains are
 particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained
--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Encoder Decoder Models
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/flaubert.rst
+++ b/docs/source/model_doc/flaubert.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 FlauBERT
 -----------------------------------------------------------------------------------------------------------------------

@ -19,7 +31,7 @@ representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018;
 heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre for
 Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
 classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most of the
-time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified evaluation
+time they outperform other pretraining approaches. Different versions of FlauBERT as well as a unified evaluation
 protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared to the research
 community for further reproducible experiments in French NLP.*

--- a/docs/source/model_doc/fsmt.rst
+++ b/docs/source/model_doc/fsmt.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 FSMT
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/funnel.rst
+++ b/docs/source/model_doc/funnel.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Funnel Transformer
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 OpenAI GPT
 -----------------------------------------------------------------------------------------------------------------------

@ -14,7 +26,7 @@ The abstract from the paper is the following:
 *Natural language understanding comprises a wide range of diverse tasks such as textual entailment, question answering,
 semantic similarity assessment, and document classification. Although large unlabeled text corpora are abundant,
 labeled data for learning these specific tasks is scarce, making it challenging for discriminatively trained models to
-perform adequately. We demonstrate that large gains on these tasks can be realized by generative pre-training of a
+perform adequately. We demonstrate that large gains on these tasks can be realized by generative pretraining of a
 language model on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each specific task. In
 contrast to previous approaches, we make use of task-aware input transformations during fine-tuning to achieve
 effective transfer while requiring minimal changes to the model architecture. We demonstrate the effectiveness of our
@ -126,3 +138,9 @@ TFOpenAIGPTDoubleHeadsModel

 .. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
    :members: call
+
+TFOpenAIGPTForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFOpenAIGPTForSequenceClassification
+    :members: call
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 OpenAI GPT2
 -----------------------------------------------------------------------------------------------------------------------

@ -71,14 +83,14 @@ GPT2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2Model
-    :members: forward
+    :members: forward, parallelize, deparallelize


 GPT2LMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2LMHeadModel
-    :members: forward
+    :members: forward, parallelize, deparallelize


 GPT2DoubleHeadsModel
@ -114,3 +126,15 @@ TFGPT2DoubleHeadsModel

 .. autoclass:: transformers.TFGPT2DoubleHeadsModel
    :members: call
+
+TFGPT2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFGPT2ForSequenceClassification
+    :members: call
+
+TFSequenceClassifierOutputWithPast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFSequenceClassifierOutputWithPast
+    :members:
--- a/docs/source/model_doc/herbert.rst
+++ b/docs/source/model_doc/herbert.rst
@ -0,0 +1,71 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+herBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The herBERT model was proposed in `KLEJ: Comprehensive Benchmark for Polish Language Understanding
+<https://www.aclweb.org/anthology/2020.acl-main.111.pdf>`__ by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, and
+Ireneusz Gawlik. It is a BERT-based Language Model trained on Polish Corpora using only MLM objective with dynamic
+masking of whole words.
+
+The abstract from the paper is the following:
+
+*In recent years, a series of Transformer-based models unlocked major improvements in general natural language
+understanding (NLU) tasks. Such a fast pace of research would not be possible without general NLU benchmarks, which
+allow for a fair comparison of the proposed methods. However, such benchmarks are available only for a handful of
+languages. To alleviate this issue, we introduce a comprehensive multi-task benchmark for the Polish language
+understanding, accompanied by an online leaderboard. It consists of a diverse set of tasks, adopted from existing
+datasets for named entity recognition, question-answering, textual entailment, and others. We also introduce a new
+sentiment analysis task for the e-commerce domain, named Allegro Reviews (AR). To ensure a common evaluation scheme and
+promote models that generalize to different NLU tasks, the benchmark includes datasets from varying domains and
+applications. Additionally, we release HerBERT, a Transformer-based model trained specifically for the Polish language,
+which has the best average performance and obtains the best results for three out of nine tasks. Finally, we provide an
+extensive evaluation, including several standard baselines and recently proposed, multilingual Transformer-based
+models.*
+
+Examples of use:
+
+.. code-block::
+
+  from transformers import HerbertTokenizer, RobertaModel
+
+  tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
+  model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
+
+  encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
+  outputs = model(encoded_input)
+
+  # HerBERT can also be loaded using AutoTokenizer and AutoModel:
+  import torch
+  from transformers import AutoModel, AutoTokenizer
+
+  tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
+  model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")
+
+
+The original code can be found `here <https://github.com/allegro/HerBERT>`__.
+
+HerbertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.HerbertTokenizer
+    :members: 
+
+HerbertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.HerbertTokenizerFast
+    :members: 
--- a/docs/source/model_doc/layoutlm.rst
+++ b/docs/source/model_doc/layoutlm.rst
@ -1,32 +1,84 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 LayoutLM
 -----------------------------------------------------------------------------------------------------------------------

+.. _Overview:
+
 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 The LayoutLM model was proposed in the paper `LayoutLM: Pre-training of Text and Layout for Document Image
 Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and
-Ming Zhou. It's a simple but effective pre-training method of text and layout for document image understanding and
-information extraction tasks, such as form understanding and receipt understanding.
+Ming Zhou. It's a simple but effective pretraining method of text and layout for document image understanding and
+information extraction tasks, such as form understanding and receipt understanding. It obtains state-of-the-art results
+on several downstream tasks:
+
+- form understanding: the `FUNSD <https://guillaumejaume.github.io/FUNSD/>`__ dataset (a collection of 199 annotated
+  forms comprising more than 30,000 words).
+- receipt understanding: the `SROIE <https://rrc.cvc.uab.es/?ch=13>`__ dataset (a collection of 626 receipts for
+  training and 347 receipts for testing).
+- document image classification: the `RVL-CDIP <https://www.cs.cmu.edu/~aharley/rvl-cdip/>`__ dataset (a collection of
+  400,000 images belonging to one of 16 classes).

 The abstract from the paper is the following:

 *Pre-training techniques have been verified successfully in a variety of NLP tasks in recent years. Despite the
-widespread use of pre-training models for NLP applications, they almost exclusively focus on text-level manipulation,
+widespread use of pretraining models for NLP applications, they almost exclusively focus on text-level manipulation,
 while neglecting layout and style information that is vital for document image understanding. In this paper, we propose
-the \textbf{LayoutLM} to jointly model interactions between text and layout information across scanned document images,
-which is beneficial for a great number of real-world document image understanding tasks such as information extraction
-from scanned documents. Furthermore, we also leverage image features to incorporate words' visual information into
-LayoutLM. To the best of our knowledge, this is the first time that text and layout are jointly learned in a single
-framework for document-level pre-training. It achieves new state-of-the-art results in several downstream tasks,
-including form understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image
-classification (from 93.07 to 94.42).*
+the LayoutLM to jointly model interactions between text and layout information across scanned document images, which is
+beneficial for a great number of real-world document image understanding tasks such as information extraction from
+scanned documents. Furthermore, we also leverage image features to incorporate words' visual information into LayoutLM.
+To the best of our knowledge, this is the first time that text and layout are jointly learned in a single framework for
+document-level pretraining. It achieves new state-of-the-art results in several downstream tasks, including form
+understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image classification
+(from 93.07 to 94.42).*

 Tips:

- LayoutLM has an extra input called :obj:`bbox`, which is the bounding boxes of the input tokens.
- The :obj:`bbox` requires the data that on 0-1000 scale, which means you should normalize the bounding box before
-  passing them into model.
+- In addition to `input_ids`, :meth:`~transformer.LayoutLMModel.forward` also expects the input :obj:`bbox`, which are
+  the bounding boxes (i.e. 2D-positions) of the input tokens. These can be obtained using an external OCR engine such
+  as Google's `Tesseract <https://github.com/tesseract-ocr/tesseract>`__ (there's a `Python wrapper
+  <https://pypi.org/project/pytesseract/>`__ available). Each bounding box should be in (x0, y0, x1, y1) format, where
+  (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1, y1) represents the
+  position of the lower right corner. Note that one first needs to normalize the bounding boxes to be on a 0-1000
+  scale. To normalize, you can use the following function:
+
+.. code-block::
+
+   def normalize_bbox(bbox, width, height):
+        return [
+            int(1000 * (bbox[0] / width)),
+            int(1000 * (bbox[1] / height)),
+            int(1000 * (bbox[2] / width)),
+            int(1000 * (bbox[3] / height)),
+        ]
+
+Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token
+occurs. Those can be obtained using the Python Image Library (PIL) library for example, as follows:
+
+.. code-block::
+
+   from PIL import Image
+
+   image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
+
+   width, height = image.size
+
+- For a demo which shows how to fine-tune :class:`LayoutLMForTokenClassification` on the `FUNSD dataset
+  <https://guillaumejaume.github.io/FUNSD/>`__ (a collection of annotated forms), see `this notebook
+  <https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb>`__.
+  It includes an inference part, which shows how to use Google's Tesseract on a new document.

 The original code can be found `here <https://github.com/microsoft/unilm/tree/master/layoutlm>`_.

@ -45,6 +97,13 @@ LayoutLMTokenizer
    :members:


+LayoutLMTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMTokenizerFast
+    :members:
+
+
 LayoutLMModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -59,6 +118,13 @@ LayoutLMForMaskedLM
    :members:


+LayoutLMForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMForSequenceClassification
+    :members:
+
+
 LayoutLMForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/led.rst
+++ b/docs/source/model_doc/led.rst
@ -0,0 +1,149 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+LED
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The LED model was proposed in `Longformer: The Long-Document Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz
+Beltagy, Matthew E. Peters, Arman Cohan.
+
+The abstract from the paper is the following:
+
+*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales
+quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention
+mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or
+longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local
+windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we
+evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In
+contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our
+pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
+WikiHop and TriviaQA. We finally introduce the Longformer-Encoder-Decoder (LED), a Longformer variant for supporting
+long document generative sequence-to-sequence tasks, and demonstrate its effectiveness on the arXiv summarization
+dataset.*
+
+Tips:
+
+- :class:`~transformers.LEDForConditionalGeneration` is an extension of
+  :class:`~transformers.BartForConditionalGeneration` exchanging the traditional *self-attention* layer with
+  *Longformer*'s *chunked self-attention* layer. :class:`~transformers.LEDTokenizer` is an alias of
+  :class:`~transformers.BartTokenizer`.
+- LED works very well on long-range *sequence-to-sequence* tasks where the ``input_ids`` largely exceed a length of
+  1024 tokens.
+- LED pads the ``input_ids`` to be a multiple of ``config.attention_window`` if required. Therefore a small speed-up is
+  gained, when :class:`~transformers.LEDTokenizer` is used with the ``pad_to_multiple_of`` argument.
+- LED makes use of *global attention* by means of the ``global_attention_mask`` (see
+  :class:`~transformers.LongformerModel`). For summarization, it is advised to put *global attention* only on the first
+  ``<s>`` token. For question answering, it is advised to put *global attention* on all tokens of the question.
+- To fine-tune LED on all 16384, it is necessary to enable *gradient checkpointing* by setting
+  ``config.gradient_checkpointing = True``.
+- A notebook showing how to evaluate LED, can be accessed `here
+  <https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing>`__.
+- A notebook showing how to fine-tune LED, can be accessed `here
+  <https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing>`__.
+
+
+LEDConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDConfig
+    :members:
+
+
+LEDTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+LEDTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDTokenizerFast
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+LED specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.led.modeling_led.LEDEncoderBaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqLMOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqSequenceClassifierOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqQuestionAnsweringModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_tf_led.TFLEDEncoderBaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_tf_led.TFLEDSeq2SeqModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_tf_led.TFLEDSeq2SeqLMOutput
+    :members: 
+
+
+
+
+LEDModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDModel
+    :members: forward
+
+
+LEDForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDForConditionalGeneration
+    :members: forward
+
+
+LEDForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDForSequenceClassification
+    :members: forward
+
+
+LEDForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDForQuestionAnswering
+    :members: forward
+
+
+TFLEDModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLEDModel
+    :members: call
+
+
+TFLEDForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLEDForConditionalGeneration
+    :members: call
--- a/docs/source/model_doc/longformer.rst
+++ b/docs/source/model_doc/longformer.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Longformer
 -----------------------------------------------------------------------------------------------------------------------

@ -22,6 +34,12 @@ contrast to most prior work, we also pretrain Longformer and finetune it on a va
 pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
 WikiHop and TriviaQA.*

+Tips:
+
+- Since the Longformer is based on RoBERTa, it doesn't have :obj:`token_type_ids`. You don't need to indicate which
+  token belongs to which segment. Just separate your segments with the separation token :obj:`tokenizer.sep_token` (or
+  :obj:`</s>`).
+
 The Authors' code can be found `here <https://github.com/allenai/longformer>`__.

 Longformer Self Attention
--- a/docs/source/model_doc/lxmert.rst
+++ b/docs/source/model_doc/lxmert.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 LXMERT
 -----------------------------------------------------------------------------------------------------------------------

@ -19,7 +31,7 @@ Encoder Representations from Transformers) framework to learn these vision-and-l
 build a large-scale Transformer model that consists of three encoders: an object relationship encoder, a language
 encoder, and a cross-modality encoder. Next, to endow our model with the capability of connecting vision and language
 semantics, we pre-train the model with large amounts of image-and-sentence pairs, via five diverse representative
-pre-training tasks: masked language modeling, masked object prediction (feature regression and label classification),
+pretraining tasks: masked language modeling, masked object prediction (feature regression and label classification),
 cross-modality matching, and image question answering. These tasks help in learning both intra-modality and
 cross-modality relationships. After fine-tuning from our pretrained parameters, our model achieves the state-of-the-art
 results on two visual question answering datasets (i.e., VQA and GQA). We also show the generalizability of our
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 MarianMT
 -----------------------------------------------------------------------------------------------------------------------

@ -21,7 +33,6 @@ Implementation Notes
 - The modeling code is the same as :class:`~transformers.BartForConditionalGeneration` with a few minor modifications:

    - static (sinusoid) positional embeddings (:obj:`MarianConfig.static_position_embeddings=True`)
-    - a new final_logits_bias (:obj:`MarianConfig.add_bias_logits=True`)
    - no layernorm_embedding (:obj:`MarianConfig.normalize_embedding=False`)
    - the model starts generating with :obj:`pad_token_id` (which has 0 as a token_embedding) as the prefix (Bart uses
      :obj:`<s/>`),
@ -44,12 +55,10 @@ Examples

 - Since Marian models are smaller than many other translation models available in the library, they can be useful for
  fine-tuning experiments and integration tests.
- `Fine-tune on TPU
-  <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/builtin_trainer/train_distil_marian_enro_tpu.sh>`__
 - `Fine-tune on GPU
-  <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/builtin_trainer/train_distil_marian_enro.sh>`__
+  <https://github.com/huggingface/transformers/blob/master/examples/research_projects/seq2seq-distillation/train_distil_marian_enro_teacher.sh>`__
 - `Fine-tune on GPU with pytorch-lightning
-  <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/distil_marian_no_teacher.sh>`__
+  <https://github.com/huggingface/transformers/blob/master/examples/research_projects/seq2seq-distillation/train_distil_marian_no_teacher.sh>`__

 Multilingual Models
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -170,13 +179,36 @@ MarianTokenizer
    :members: prepare_seq2seq_batch


+MarianModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MarianModel
+    :members: forward
+
+
 MarianMTModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.MarianMTModel
+    :members: forward
+
+
+MarianForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MarianForCausalLM
+    :members: forward
+
+
+TFMarianModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMarianModel
+    :members: call


 TFMarianMTModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFMarianMTModel
+    :members: call
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 MBart
 -----------------------------------------------------------------------------------------------------------------------

@ -13,7 +25,7 @@ The MBart model was presented in `Multilingual Denoising Pre-training for Neural
 Ghazvininejad, Mike Lewis, Luke Zettlemoyer.

 According to the abstract, MBART is a sequence-to-sequence denoising auto-encoder pretrained on large-scale monolingual
-corpora in many languages using the BART objective. mBART is one of the first methods for pre-training a complete
+corpora in many languages using the BART objective. mBART is one of the first methods for pretraining a complete
 sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only
 on the encoder, decoder, or reconstructing parts of the text.

@ -23,7 +35,7 @@ Examples
 _______________________________________________________________________________________________________________________

 - Examples and scripts for fine-tuning mBART and other models for sequence to sequence tasks can be found in
-  `examples/seq2seq/ <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/README.md>`__.
+  :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
 - Given the large embeddings table, mBART consumes a large amount of GPU RAM, especially for fine-tuning.
  :class:`MarianMTModel` is usually a better choice for bilingual machine translation.

@ -78,6 +90,20 @@ MBartTokenizer
    :members: build_inputs_with_special_tokens, prepare_seq2seq_batch


+MBartTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartTokenizerFast
+    :members:
+
+
+MBartModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartModel
+    :members:
+
+
 MBartForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -85,8 +111,35 @@ MBartForConditionalGeneration
    :members:


+MBartForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartForQuestionAnswering
+    :members:
+
+
+MBartForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartForSequenceClassification
+
+
+MBartForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartForCausalLM
+    :members: forward
+
+
+TFMBartModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMBartModel
+    :members: call
+
+
 TFMBartForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFMBartForConditionalGeneration
-    :members:
+    :members: call
--- a/docs/source/model_doc/mobilebert.rst
+++ b/docs/source/model_doc/mobilebert.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 MobileBERT
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/mpnet.rst
+++ b/docs/source/model_doc/mpnet.rst
@ -0,0 +1,149 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+MPNet
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The MPNet model was proposed in `MPNet: Masked and Permuted Pre-training for Language Understanding
+<https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+
+MPNet adopts a novel pre-training method, named masked and permuted language modeling, to inherit the advantages of
+masked language modeling and permuted language modeling for natural language understanding.
+
+The abstract from the paper is the following:
+
+*BERT adopts masked language modeling (MLM) for pre-training and is one of the most successful pre-training models.
+Since BERT neglects dependency among predicted tokens, XLNet introduces permuted language modeling (PLM) for
+pre-training to address this problem. However, XLNet does not leverage the full position information of a sentence and
+thus suffers from position discrepancy between pre-training and fine-tuning. In this paper, we propose MPNet, a novel
+pre-training method that inherits the advantages of BERT and XLNet and avoids their limitations. MPNet leverages the
+dependency among predicted tokens through permuted language modeling (vs. MLM in BERT), and takes auxiliary position
+information as input to make the model see a full sentence and thus reducing the position discrepancy (vs. PLM in
+XLNet). We pre-train MPNet on a large-scale dataset (over 160GB text corpora) and fine-tune on a variety of
+down-streaming tasks (GLUE, SQuAD, etc). Experimental results show that MPNet outperforms MLM and PLM by a large
+margin, and achieves better results on these tasks compared with previous state-of-the-art pre-trained methods (e.g.,
+BERT, XLNet, RoBERTa) under the same model setting.*
+
+Tips:
+
+- MPNet doesn't have :obj:`token_type_ids`, you don't need to indicate which token belongs to which segment. just
+  separate your segments with the separation token :obj:`tokenizer.sep_token` (or :obj:`[sep]`).
+
+The original code can be found `here <https://github.com/microsoft/MPNet>`__.
+
+MPNetConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetConfig
+    :members:
+
+
+MPNetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+MPNetTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetTokenizerFast
+    :members:
+
+
+MPNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetModel
+    :members: forward
+
+
+MPNetForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForMaskedLM
+    :members: forward
+
+
+MPNetForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForSequenceClassification
+    :members: forward
+
+
+MPNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForMultipleChoice
+    :members: forward
+
+
+MPNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForTokenClassification
+    :members: forward
+
+
+MPNetForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForQuestionAnswering
+    :members: forward
+
+
+TFMPNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetModel
+    :members: call
+
+
+TFMPNetForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForMaskedLM
+    :members: call
+
+
+TFMPNetForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForSequenceClassification
+    :members: call
+
+
+TFMPNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForMultipleChoice
+    :members: call
+
+
+TFMPNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForTokenClassification
+    :members: call
+
+
+TFMPNetForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/mt5.rst
+++ b/docs/source/model_doc/mt5.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 MT5
 -----------------------------------------------------------------------------------------------------------------------

@ -25,6 +37,22 @@ MT5Config
    :members:


+MT5Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5Tokenizer
+
+See :class:`~transformers.T5Tokenizer` for all details.
+
+
+MT5TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5TokenizerFast
+
+See :class:`~transformers.T5TokenizerFast` for all details.
+
+
 MT5Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -39,6 +67,13 @@ MT5ForConditionalGeneration
    :members:


+MT5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5EncoderModel
+    :members:
+
+
 TFMT5Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -51,3 +86,10 @@ TFMT5ForConditionalGeneration

 .. autoclass:: transformers.TFMT5ForConditionalGeneration
    :members:
+
+
+TFMT5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMT5EncoderModel
+    :members:
--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Pegasus
 -----------------------------------------------------------------------------------------------------------------------

@ -39,9 +51,8 @@ All the `checkpoints <https://huggingface.co/models?search=pegasus>`__ are fine-
 Examples
 _______________________________________________________________________________________________________________________

- `Script <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/finetune_pegasus_xsum.sh>`__ to
-  fine-tune pegasus on the XSUM dataset. Data download instructions at `examples/seq2seq/
-  <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/README.md>`__.
+- :prefix_link:`Script <examples/seq2seq/finetune_pegasus_xsum.sh>` to fine-tune pegasus on the XSUM dataset. Data
+  download instructions at :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
 - FP16 is not supported (help/ideas on this appreciated!).
 - The adafactor optimizer is recommended for pegasus fine-tuning.

@ -54,7 +65,6 @@ Implementation Notes
 - Some key configuration differences:

    - static, sinusoidal position embeddings
-    - no :obj:`layernorm_embedding` (:obj:`PegasusConfig.normalize_embedding=False`)
    - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix.
    - more beams are used (:obj:`num_beams=8`)
 - All pretrained pegasus checkpoints are the same besides three attributes: :obj:`tokenizer.model_max_length` (maximum
@ -100,13 +110,43 @@ warning: ``add_tokens`` does not work at the moment.
    :members: __call__, prepare_seq2seq_batch


+PegasusTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PegasusTokenizerFast
+    :members:
+
+
+PegasusModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PegasusModel
+    :members: forward
+
+
 PegasusForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PegasusForConditionalGeneration
+    :members: forward
+
+
+PegasusForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PegasusForCausalLM
+    :members: forward
+
+
+TFPegasusModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFPegasusModel
+    :members: call


 TFPegasusForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFPegasusForConditionalGeneration
+    :members: call
--- a/docs/source/model_doc/phobert.rst
+++ b/docs/source/model_doc/phobert.rst
@ -0,0 +1,59 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+PhoBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The PhoBERT model was proposed in `PhoBERT: Pre-trained language models for Vietnamese
+<https://www.aclweb.org/anthology/2020.findings-emnlp.92.pdf>`__ by Dat Quoc Nguyen, Anh Tuan Nguyen.
+
+The abstract from the paper is the following:
+
+*We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual
+language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent
+best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple
+Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and
+Natural language inference.*
+
+Example of use:
+
+.. code-block::
+
+  import torch
+  from transformers import AutoModel, AutoTokenizer
+
+  phobert = AutoModel.from_pretrained("vinai/phobert-base")
+  tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
+
+  # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
+  line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
+
+  input_ids = torch.tensor([tokenizer.encode(line)])
+
+  with torch.no_grad():
+      features = phobert(input_ids)  # Models outputs are now tuples
+
+  ## With TensorFlow 2.0+:
+  # from transformers import TFAutoModel
+  # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
+
+
+The original code can be found `here <https://github.com/VinAIResearch/PhoBERT>`__.
+
+PhobertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PhobertTokenizer
+    :members: 
--- a/docs/source/model_doc/prophetnet.rst
+++ b/docs/source/model_doc/prophetnet.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 ProphetNet
 -----------------------------------------------------------------------------------------------------------------------

@ -17,7 +29,7 @@ the next token.

 The abstract from the paper is the following:

-*In this paper, we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
 self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
 the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
 n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
@ -25,7 +37,7 @@ step. The future n-gram prediction explicitly encourages the model to plan for t
 overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
 dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
 abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
-state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus.*
+state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*

 The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.

--- a/docs/source/model_doc/rag.rst
+++ b/docs/source/model_doc/rag.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 RAG
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Reformer
 -----------------------------------------------------------------------------------------------------------------------

@ -151,6 +163,13 @@ ReformerTokenizer
    :members: save_vocabulary


+ReformerTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerTokenizerFast
+    :members:
+
+
 ReformerModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/retribert.rst
+++ b/docs/source/model_doc/retribert.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 RetriBERT
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 RoBERTa
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/squeezebert.rst
+++ b/docs/source/model_doc/squeezebert.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 SqueezeBERT
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 T5
 -----------------------------------------------------------------------------------------------------------------------

@ -17,7 +29,7 @@ The abstract from the paper is the following:
 task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning
 has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of
 transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a
-text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer
+text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer
 approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration
 with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering
 summarization, question answering, text classification, and more. To facilitate future work on transfer learning for
@ -32,9 +44,9 @@ Tips:

  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper
  <https://arxiv.org/pdf/1910.10683.pdf>`__. - For sequence-to-sequence generation, it is recommended to use
-  :obj:`T5ForConditionalGeneration.generate()``. This method takes care of feeding the encoded input via
-  cross-attention layers to the decoder and auto-regressively generates the decoder output. - T5 uses relative scalar
-  embeddings. Encoder input padding can be done on the left and on the right.
+  :obj:`T5ForConditionalGeneration.generate()`. This method takes care of feeding the encoded input via cross-attention
+  layers to the decoder and auto-regressively generates the decoder output. - T5 uses relative scalar embeddings.
+  Encoder input padding can be done on the left and on the right.

 The original code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`__.

@ -43,7 +55,7 @@ Training

 T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher
 forcing. This means that for training we always need an input sequence and a target sequence. The input sequence is fed
-to the model using :obj:`input_ids``. The target sequence is shifted to the right, i.e., prepended by a start-sequence
+to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a start-sequence
 token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target sequence is then
 appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the start-sequence
 token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
@ -95,19 +107,31 @@ T5Tokenizer
        create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary


+T5TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5TokenizerFast
+    :members:
+
+
 T5Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.T5Model
-    :members: forward
+    :members: forward, parallelize, deparallelize


 T5ForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.T5ForConditionalGeneration
-    :members: forward
+    :members: forward, parallelize, deparallelize

+T5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5EncoderModel
+    :members: forward, parallelize, deparallelize

 TFT5Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -121,3 +145,9 @@ TFT5ForConditionalGeneration

 .. autoclass:: transformers.TFT5ForConditionalGeneration
    :members: call
+
+TFT5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFT5EncoderModel
+    :members: call
--- a/docs/source/model_doc/tapas.rst
+++ b/docs/source/model_doc/tapas.rst
@ -0,0 +1,434 @@
+TAPAS
+-----------------------------------------------------------------------------------------------------------------------
+
+.. note::
+
+    This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight
+    breaking changes to fix them in the future.
+
+
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The TAPAS model was proposed in `TAPAS: Weakly Supervised Table Parsing via Pre-training
+<https://www.aclweb.org/anthology/2020.acl-main.398>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
+Francesco Piccinno and Julian Martin Eisenschlos. It's a BERT-based model specifically designed (and pre-trained) for
+answering questions about tabular data. Compared to BERT, TAPAS uses relative position embeddings and has 7 token types
+that encode tabular structure. TAPAS is pre-trained on the masked language modeling (MLM) objective on a large dataset
+comprising millions of tables from English Wikipedia and corresponding texts. For question answering, TAPAS has 2 heads
+on top: a cell selection head and an aggregation head, for (optionally) performing aggregations (such as counting or
+summing) among selected cells. TAPAS has been fine-tuned on several datasets: `SQA
+<https://www.microsoft.com/en-us/download/details.aspx?id=54253>`__ (Sequential Question Answering by Microsoft), `WTQ
+<https://github.com/ppasupat/WikiTableQuestions>`__ (Wiki Table Questions by Stanford University) and `WikiSQL
+<https://github.com/salesforce/WikiSQL>`__ (by Salesforce). It achieves state-of-the-art on both SQA and WTQ, while
+having comparable performance to SOTA on WikiSQL, with a much simpler architecture.
+
+The abstract from the paper is the following:
+
+*Answering natural language questions over tables is usually seen as a semantic parsing task. To alleviate the
+collection cost of full logical forms, one popular approach focuses on weak supervision consisting of denotations
+instead of logical forms. However, training semantic parsers from weak supervision poses difficulties, and in addition,
+the generated logical forms are only used as an intermediate step prior to retrieving the denotation. In this paper, we
+present TAPAS, an approach to question answering over tables without generating logical forms. TAPAS trains from weak
+supervision, and predicts the denotation by selecting table cells and optionally applying a corresponding aggregation
+operator to such selection. TAPAS extends BERT's architecture to encode tables as input, initializes from an effective
+joint pre-training of text segments and tables crawled from Wikipedia, and is trained end-to-end. We experiment with
+three different semantic parsing datasets, and find that TAPAS outperforms or rivals semantic parsing models by
+improving state-of-the-art accuracy on SQA from 55.1 to 67.2 and performing on par with the state-of-the-art on WIKISQL
+and WIKITQ, but with a simpler model architecture. We additionally find that transfer learning, which is trivial in our
+setting, from WIKISQL to WIKITQ, yields 48.7 accuracy, 4.2 points above the state-of-the-art.*
+
+In addition, the authors have further pre-trained TAPAS to recognize **table entailment**, by creating a balanced
+dataset of millions of automatically created training examples which are learned in an intermediate step prior to
+fine-tuning. The authors of TAPAS call this further pre-training intermediate pre-training (since TAPAS is first
+pre-trained on MLM, and then on another dataset). They found that intermediate pre-training further improves
+performance on SQA, achieving a new state-of-the-art as well as state-of-the-art on `TabFact
+<https://github.com/wenhuchen/Table-Fact-Checking>`__, a large-scale dataset with 16k Wikipedia tables for table
+entailment (a binary classification task). For more details, see their follow-up paper: `Understanding tables with
+intermediate pre-training <https://www.aclweb.org/anthology/2020.findings-emnlp.27/>`__ by Julian Martin Eisenschlos,
+Syrine Krichene and Thomas Müller.
+
+The original code can be found `here <https://github.com/google-research/tapas>`__.
+
+Tips:
+
+- TAPAS is a model that uses relative position embeddings by default (restarting the position embeddings at every cell
+  of the table). Note that this is something that was added after the publication of the original TAPAS paper.
+  According to the authors, this usually results in a slightly better performance, and allows you to encode longer
+  sequences without running out of embeddings. This is reflected in the ``reset_position_index_per_cell`` parameter of
+  :class:`~transformers.TapasConfig`, which is set to ``True`` by default. The default versions of the models available
+  in the `model hub <https://huggingface.co/models?search=tapas>`_ all use relative position embeddings. You can still
+  use the ones with absolute position embeddings by passing in an additional argument ``revision="no_reset"`` when
+  calling the ``.from_pretrained()`` method. Note that it's usually advised to pad the inputs on the right rather than
+  the left.
+- TAPAS is based on BERT, so ``TAPAS-base`` for example corresponds to a ``BERT-base`` architecture. Of course,
+  TAPAS-large will result in the best performance (the results reported in the paper are from TAPAS-large). Results of
+  the various sized models are shown on the `original Github repository <https://github.com/google-research/tapas>`_.
+- TAPAS has checkpoints fine-tuned on SQA, which are capable of answering questions related to a table in a
+  conversational set-up. This means that you can ask follow-up questions such as "what is his age?" related to the
+  previous question. Note that the forward pass of TAPAS is a bit different in case of a conversational set-up: in that
+  case, you have to feed every table-question pair one by one to the model, such that the `prev_labels` token type ids
+  can be overwritten by the predicted `labels` of the model to the previous question. See "Usage" section for more
+  info.
+- TAPAS is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore
+  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained
+  with a causal language modeling (CLM) objective are better in that regard.
+
+
+Usage: fine-tuning
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here we explain how you can fine-tune :class:`~transformers.TapasForQuestionAnswering` on your own dataset.
+
+**STEP 1: Choose one of the 3 ways in which you can use TAPAS - or experiment**
+
+Basically, there are 3 different ways in which one can fine-tune :class:`~transformers.TapasForQuestionAnswering`,
+corresponding to the different datasets on which Tapas was fine-tuned:
+
+1. SQA: if you're interested in asking follow-up questions related to a table, in a conversational set-up. For example
+   if you first ask "what's the name of the first actor?" then you can ask a follow-up question such as "how old is
+   he?". Here, questions do not involve any aggregation (all questions are cell selection questions).
+2. WTQ: if you're not interested in asking questions in a conversational set-up, but rather just asking questions
+   related to a table, which might involve aggregation, such as counting a number of rows, summing up cell values or
+   averaging cell values. You can then for example ask "what's the total number of goals Cristiano Ronaldo made in his
+   career?". This case is also called **weak supervision**, since the model itself must learn the appropriate
+   aggregation operator (SUM/COUNT/AVERAGE/NONE) given only the answer to the question as supervision.
+3. WikiSQL-supervised: this dataset is based on WikiSQL with the model being given the ground truth aggregation
+   operator during training. This is also called **strong supervision**. Here, learning the appropriate aggregation
+   operator is much easier.
+
+To summarize:
+
+------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| **Task**                           | **Example dataset**  | **Description**                                                                                                   |
+------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| Conversational                     | SQA                  | Conversational, only cell selection questions                                                                     |
+------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| Weak supervision for aggregation   | WTQ                  | Questions might involve aggregation, and the model must learn this given only the answer as supervision           |
+------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| Strong supervision for aggregation | WikiSQL-supervised   | Questions might involve aggregation, and the model must learn this given the gold aggregation operator            |
+------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+
+Initializing a model with a pre-trained base and randomly initialized classification heads from the model hub can be
+done as follows (be sure to have installed the `torch-scatter dependency <https://github.com/rusty1s/pytorch_scatter>`_
+for your environment):
+
+.. code-block::
+
+        >>> from transformers import TapasConfig, TapasForQuestionAnswering
+
+        >>> # for example, the base sized model with default SQA configuration
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base')
+
+        >>> # or, the base sized model with WTQ configuration
+        >>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq')
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
+        >>> # or, the base sized model with WikiSQL configuration
+        >>> config = TapasConfig('google-base-finetuned-wikisql-supervised')
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
+
+Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also
+experiment by defining any hyperparameters you want when initializing :class:`~transformers.TapasConfig`, and then
+create a :class:`~transformers.TapasForQuestionAnswering` based on that configuration. For example, if you have a
+dataset that has both conversational questions and questions that might involve aggregation, then you can do it this
+way. Here's an example:
+
+.. code-block::
+
+        >>> from transformers import TapasConfig, TapasForQuestionAnswering
+
+        >>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
+        >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
+        >>> # initializing the pre-trained base sized model with our custom classification heads
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
+What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned
+checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See `here
+<https://github.com/google-research/tapas/issues/91#issuecomment-735719340>`__ for more info.
+
+For a list of all pre-trained and fine-tuned TAPAS checkpoints available in the HuggingFace model hub, see `here
+<https://huggingface.co/models?search=tapas>`__.
+
+**STEP 2: Prepare your data in the SQA format**
+
+Second, no matter what you picked above, you should prepare your dataset in the `SQA format
+<https://www.microsoft.com/en-us/download/details.aspx?id=54253>`__. This format is a TSV/CSV file with the following
+columns:
+
+- ``id``: optional, id of the table-question pair, for bookkeeping purposes.
+- ``annotator``: optional, id of the person who annotated the table-question pair, for bookkeeping purposes.
+- ``position``: integer indicating if the question is the first, second, third,... related to the table. Only required
+  in case of conversational setup (SQA). You don't need this column in case you're going for WTQ/WikiSQL-supervised.
+- ``question``: string
+- ``table_file``: string, name of a csv file containing the tabular data
+- ``answer_coordinates``: list of one or more tuples (each tuple being a cell coordinate, i.e. row, column pair that is
+  part of the answer)
+- ``answer_text``: list of one or more strings (each string being a cell value that is part of the answer)
+- ``aggregation_label``: index of the aggregation operator. Only required in case of strong supervision for aggregation
+  (the WikiSQL-supervised case)
+- ``float_answer``: the float answer to the question, if there is one (np.nan if there isn't). Only required in case of
+  weak supervision for aggregation (such as WTQ and WikiSQL)
+
+The tables themselves should be present in a folder, each table being a separate csv file. Note that the authors of the
+TAPAS algorithm used conversion scripts with some automated logic to convert the other datasets (WTQ, WikiSQL) into the
+SQA format. The author explains this `here
+<https://github.com/google-research/tapas/issues/50#issuecomment-705465960>`__. Interestingly, these conversion scripts
+are not perfect (the ``answer_coordinates`` and ``float_answer`` fields are populated based on the ``answer_text``),
+meaning that WTQ and WikiSQL results could actually be improved.
+
+**STEP 3: Convert your data into PyTorch tensors using TapasTokenizer**
+
+Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular
+data), you can then use :class:`~transformers.TapasTokenizer` to convert table-question pairs into :obj:`input_ids`,
+:obj:`attention_mask`, :obj:`token_type_ids` and so on. Again, based on which of the three cases you picked above,
+:class:`~transformers.TapasForQuestionAnswering` requires different inputs to be fine-tuned:
+
+------------------------------------+----------------------------------------------------------------------------------------------+
+| **Task**                           | **Required inputs**                                                                          |
+------------------------------------+----------------------------------------------------------------------------------------------+
+| Conversational                     | ``input_ids``, ``attention_mask``, ``token_type_ids``, ``labels``                            |
+------------------------------------+----------------------------------------------------------------------------------------------+
+| Weak supervision for aggregation   | ``input_ids``, ``attention_mask``, ``token_type_ids``, ``labels``, ``numeric_values``,       |
+|                                    | ``numeric_values_scale``, ``float_answer``                                                   |
+------------------------------------+----------------------------------------------------------------------------------------------+
+| Strong supervision for aggregation | ``input ids``, ``attention mask``, ``token type ids``, ``labels``, ``aggregation_labels``    |
+------------------------------------+----------------------------------------------------------------------------------------------+
+
+:class:`~transformers.TapasTokenizer` creates the ``labels``, ``numeric_values`` and ``numeric_values_scale`` based on
+the ``answer_coordinates`` and ``answer_text`` columns of the TSV file. The ``float_answer`` and ``aggregation_labels``
+are already in the TSV file of step 2. Here's an example:
+
+.. code-block::
+
+        >>> from transformers import TapasTokenizer
+        >>> import pandas as pd
+
+        >>> model_name = 'google/tapas-base'
+        >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
+        >>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+        >>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]]
+        >>> answer_text = [["Brad Pitt"], ["69"], ["209"]]
+        >>> table = pd.DataFrame.from_dict(data)
+        >>> inputs = tokenizer(table=table, queries=queries, answer_coordinates=answer_coordinates, answer_text=answer_text, padding='max_length', return_tensors='pt')
+        >>> inputs
+        {'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]),
+        'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])}
+
+Note that :class:`~transformers.TapasTokenizer` expects the data of the table to be **text-only**. You can use
+``.astype(str)`` on a dataframe to turn it into text-only data. Of course, this only shows how to encode a single
+training example. It is advised to create a PyTorch dataset and a corresponding dataloader:
+
+.. code-block::
+
+        >>> import torch
+        >>> import pandas as pd
+
+        >>> tsv_path = "your_path_to_the_tsv_file"
+        >>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"
+
+        >>> class TableDataset(torch.utils.data.Dataset):
+        ...     def __init__(self, data, tokenizer):
+        ...         self.data = data
+        ...         self.tokenizer = tokenizer
+        ...
+        ...     def __getitem__(self, idx):
+        ...         item = data.iloc[idx]
+        ...         table = pd.read_csv(table_csv_path + item.table_file).astype(str) # be sure to make your table data text only
+        ...         encoding = self.tokenizer(table=table, 
+        ...                                   queries=item.question, 
+        ...                                   answer_coordinates=item.answer_coordinates, 
+        ...                                   answer_text=item.answer_text,
+        ...                                   truncation=True,
+        ...                                   padding="max_length",
+        ...                                   return_tensors="pt"
+        ...         )
+        ...         # remove the batch dimension which the tokenizer adds by default
+        ...         encoding = {key: val.squeeze(0) for key, val in encoding.items()}
+        ...         # add the float_answer which is also required (weak supervision for aggregation case)
+        ...         encoding["float_answer"] = torch.tensor(item.float_answer) 
+        ...         return encoding
+        ...
+        ...     def __len__(self):
+        ...        return len(self.data)
+
+        >>> data = pd.read_csv(tsv_path, sep='\t')
+        >>> train_dataset = TableDataset(data, tokenizer)
+        >>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
+
+Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not
+conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group
+together the ``queries``, ``answer_coordinates`` and ``answer_text`` per table (in the order of their ``position``
+index) and batch encode each table with its questions. This will make sure that the ``prev_labels`` token types (see
+docs of :class:`~transformers.TapasTokenizer`) are set correctly. See `this notebook
+<https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__
+for more info.
+
+**STEP 4: Train (fine-tune) TapasForQuestionAnswering**
+
+You can then fine-tune :class:`~transformers.TapasForQuestionAnswering` using native PyTorch as follows (shown here for
+the weak supervision for aggregation case):
+
+.. code-block::
+
+        >>> from transformers import TapasConfig, TapasForQuestionAnswering, AdamW
+
+        >>> # this is the default WTQ configuration
+        >>> config = TapasConfig(
+        ...            num_aggregation_labels = 4,
+        ...            use_answer_as_supervision = True,
+        ...            answer_loss_cutoff = 0.664694,
+        ...            cell_selection_preference = 0.207951,
+        ...            huber_loss_delta = 0.121194,
+        ...            init_cell_selection_weights_to_zero = True,
+        ...            select_one_column = True,
+        ...            allow_empty_column_selection = False,
+        ...            temperature = 0.0352513,
+        ... )
+        >>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+        >>> optimizer = AdamW(model.parameters(), lr=5e-5)
+
+        >>> for epoch in range(2):  # loop over the dataset multiple times
+        ...    for idx, batch in enumerate(train_dataloader):
+        ...         # get the inputs; 
+        ...         input_ids = batch["input_ids"]
+        ...         attention_mask = batch["attention_mask"]
+        ...         token_type_ids = batch["token_type_ids"]
+        ...         labels = batch["labels"]
+        ...         numeric_values = batch["numeric_values"]
+        ...         numeric_values_scale = batch["numeric_values_scale"]
+        ...         float_answer = batch["float_answer"]
+
+        ...         # zero the parameter gradients
+        ...         optimizer.zero_grad()
+
+        ...         # forward + backward + optimize
+        ...         outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, 
+        ...                        labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, 
+        ...                        float_answer=float_answer)
+        ...         loss = outputs.loss
+        ...         loss.backward()
+        ...         optimizer.step()
+
+Usage: inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here we explain how you can use :class:`~transformers.TapasForQuestionAnswering` for inference (i.e. making predictions
+on new data). For inference, only ``input_ids``, ``attention_mask`` and ``token_type_ids`` (which you can obtain using
+:class:`~transformers.TapasTokenizer`) have to be provided to the model to obtain the logits. Next, you can use the
+handy ``convert_logits_to_predictions`` method of :class:`~transformers.TapasTokenizer` to convert these into predicted
+coordinates and optional aggregation indices.
+
+However, note that inference is **different** depending on whether or not the setup is conversational. In a
+non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example
+of that:
+
+.. code-block::
+
+        >>> from transformers import TapasTokenizer, TapasForQuestionAnswering
+        >>> import pandas as pd 
+
+        >>> model_name = 'google/tapas-base-finetuned-wtq'
+        >>> model = TapasForQuestionAnswering.from_pretrained(model_name)
+        >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
+        >>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+        >>> table = pd.DataFrame.from_dict(data)
+        >>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt") 
+        >>> outputs = model(**inputs)
+        >>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
+        ...         inputs, 
+        ...         outputs.logits.detach(), 
+        ...         outputs.logits_aggregation.detach()
+        ... )
+
+        >>> # let's print out the results:
+        >>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
+        >>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]
+
+        >>> answers = []
+        >>> for coordinates in predicted_answer_coordinates:
+        ...   if len(coordinates) == 1:
+        ...     # only a single cell:
+        ...     answers.append(table.iat[coordinates[0]])
+        ...   else:
+        ...     # multiple cells
+        ...     cell_values = []
+        ...     for coordinate in coordinates:
+        ...        cell_values.append(table.iat[coordinate])
+        ...     answers.append(", ".join(cell_values))
+
+        >>> display(table)
+        >>> print("")
+        >>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
+        ...   print(query)
+        ...   if predicted_agg == "NONE":
+        ...     print("Predicted answer: " + answer)
+        ...   else:
+        ...     print("Predicted answer: " + predicted_agg + " > " + answer)    
+        What is the name of the first actor?
+        Predicted answer: Brad Pitt
+        How many movies has George Clooney played in?
+        Predicted answer: COUNT > 69
+        What is the total number of movies?
+        Predicted answer: SUM > 87, 53, 69
+
+In case of a conversational set-up, then each table-question pair must be provided **sequentially** to the model, such
+that the ``prev_labels`` token types can be overwritten by the predicted ``labels`` of the previous table-question
+pair. Again, more info can be found in `this notebook
+<https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__.
+
+
+Tapas specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.tapas.modeling_tapas.TableQuestionAnsweringOutput
+    :members:
+
+
+TapasConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasConfig
+    :members:
+
+
+TapasTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasTokenizer
+    :members: __call__, convert_logits_to_predictions, save_vocabulary
+
+
+TapasModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasModel
+    :members: forward
+
+
+TapasForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasForMaskedLM
+    :members: forward
+
+
+TapasForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasForSequenceClassification
+    :members: forward
+
+
+TapasForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Transformer XL
 -----------------------------------------------------------------------------------------------------------------------

@ -76,6 +88,13 @@ TransfoXLLMHeadModel
    :members: forward


+TransfoXLForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TransfoXLForSequenceClassification
+    :members: forward
+
+
 TFTransfoXLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@ -88,3 +107,18 @@ TFTransfoXLLMHeadModel

 .. autoclass:: transformers.TFTransfoXLLMHeadModel
    :members: call
+
+
+TFTransfoXLForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTransfoXLForSequenceClassification
+    :members: call
+
+
+Internal Layers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AdaptiveEmbedding
+
+.. autoclass:: transformers.TFAdaptiveEmbedding
--- a/docs/source/model_doc/wav2vec2.rst
+++ b/docs/source/model_doc/wav2vec2.rst
@ -0,0 +1,65 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Wav2Vec2
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Wav2Vec2 model was proposed in `wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
+<https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+
+The abstract from the paper is the following:
+
+*We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on
+transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks
+the speech input in the latent space and solves a contrastive task defined over a quantization of the latent
+representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the
+clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state
+of the art on the 100 hour subset while using 100 times less labeled data. Using just ten minutes of labeled data and
+pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech
+recognition with limited amounts of labeled data.*
+
+Tips:
+
+- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
+- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
+  using :class:`~transformers.Wav2Vec2Tokenizer`.
+
+
+Wav2Vec2Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Wav2Vec2Config
+    :members:
+
+
+Wav2Vec2Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Wav2Vec2Tokenizer
+    :members: __call__, save_vocabulary
+
+
+Wav2Vec2Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Wav2Vec2Model
+    :members: forward
+
+
+Wav2Vec2ForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Wav2Vec2ForCTC
+    :members: forward
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLM
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/xlmprophetnet.rst
+++ b/docs/source/model_doc/xlmprophetnet.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLM-ProphetNet
 -----------------------------------------------------------------------------------------------------------------------

@ -19,7 +31,7 @@ just the next token. Its architecture is identical to ProhpetNet, but the model

 The abstract from the paper is the following:

-*In this paper, we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
 self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
 the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
 n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
@ -27,7 +39,7 @@ step. The future n-gram prediction explicitly encourages the model to plan for t
 overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
 dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
 abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
-state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus.*
+state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*

 The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.

--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLM-RoBERTa
 -----------------------------------------------------------------------------------------------------------------------

@ -50,6 +62,13 @@ XLMRobertaTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+XLMRobertaTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaTokenizerFast
+    :members:
+
+
 XLMRobertaModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLNet
 -----------------------------------------------------------------------------------------------------------------------

@ -50,6 +62,13 @@ XLNetTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+XLNetTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetTokenizerFast
+    :members:
+
+
 XLNet specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_sharing.rst
+++ b/docs/source/model_sharing.rst
@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Model sharing and uploading
 =======================================================================================================================

@ -48,7 +60,7 @@ Basic steps
 In order to upload a model, you'll need to first create a git repo. This repo will live on the model hub, allowing
 users to clone it and you (and your organization members) to push to it.

-You can create a model repo directly from the website, `here <https://huggingface.co/new>`.
+You can create a model repo directly from `the /new page on the website <https://huggingface.co/new>`__.

 Alternatively, you can use the ``transformers-cli``. The next steps describe that process:

@ -66,16 +78,22 @@ Once you are logged in with your model hub credentials, you can start building y

    transformers-cli repo create your-model-name

+If you want to create a repo under a specific organization, you should add a `--organization` flag:
+
+.. code-block:: bash
+
+    transformers-cli repo create your-model-name --organization your-org-name
+
 This creates a repo on the model hub, which can be cloned.

 .. code-block:: bash

-    git clone https://huggingface.co/username/your-model-name
-
    # Make sure you have git-lfs installed
    # (https://git-lfs.github.com/)
    git lfs install

+    git clone https://huggingface.co/username/your-model-name
+
 When you have your local clone of your repo and lfs installed, you can then add/remove from that clone as you would
 with any other git repo.

@ -86,8 +104,15 @@ with any other git repo.
    echo "hello" >> README.md
    git add . && git commit -m "Update from $USER"

-We are intentionally not wrapping git too much, so as to stay intuitive and easy-to-use.
+We are intentionally not wrapping git too much, so that you can go on with the workflow you're used to and the tools
+you already know.

+The only learning curve you might have compared to regular git is the one for git-lfs. The documentation at
+`git-lfs.github.com <https://git-lfs.github.com/>`__ is decent, but we'll work on a tutorial with some tips and tricks
+in the coming weeks!
+
+Additionally, if you want to change multiple repos at once, the `change_config.py script
+<https://github.com/huggingface/efficient_scripts/blob/main/change_config.py>`__ can probably save you some time.

 Make your model work on all frameworks
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -98,7 +123,7 @@ Make your model work on all frameworks
 You probably have your favorite framework, but so will other users! That's why it's best to upload your model with both
 PyTorch `and` TensorFlow checkpoints to make it easier to use (if you skip this step, users will still be able to load
 your model in another framework, but it will be slower, as it will have to be converted on the fly). Don't worry, it's
-super easy to do (and in a future version, it will all be automatic). You will need to install both PyTorch and
+super easy to do (and in a future version, it might all be automatic). You will need to install both PyTorch and
 TensorFlow for this step, but you don't need to worry about the GPU, so it should be very easy. Check the `TensorFlow
 installation page <https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available>`__ and/or the `PyTorch
 installation page <https://pytorch.org/get-started/locally/#start-locally>`__ to see how.
@ -180,7 +205,7 @@ status`` command:
    git add --all
    git status

-Finally, the files should be comitted:
+Finally, the files should be committed:

 .. code-block:: bash

@ -198,23 +223,20 @@ This will upload the folder containing the weights, tokenizer and configuration
 Add a model card
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-To make sure everyone knows what your model can do, what its limitations and potential bias or ethetical
-considerations, please add a README.md model card to the 🤗 Transformers repo under `model_cards/`. It should then be
-placed in a subfolder with your username or organization, then another subfolder named like your model
-(`awesome-name-you-picked`). Or just click on the "Create a model card on GitHub" button on the model page, it will get
-you directly to the right location. If you need one, `here <https://github.com/huggingface/model_card>`__ is a model
-card template (meta-suggestions are welcome).
+To make sure everyone knows what your model can do, what its limitations, potential bias or ethical considerations are,
+please add a README.md model card to your model repo. You can just create it, or there's also a convenient button
+titled "Add a README.md" on your model page. A model card template can be found `here
+<https://github.com/huggingface/model_card>`__ (meta-suggestions are welcome). model card template (meta-suggestions
+are welcome).
+
+.. note::
+
+    Model cards used to live in the 🤗 Transformers repo under `model_cards/`, but for consistency and scalability we
+    migrated every model card from the repo to its corresponding huggingface.co model repo.

 If your model is fine-tuned from another model coming from the model hub (all 🤗 Transformers pretrained models do),
 don't forget to link to its model card so that people can fully trace how your model was built.

-If you have never made a pull request to the 🤗 Transformers repo, look at the :doc:`contributing guide <contributing>`
-to see the steps to follow.
-
-.. note::
-
-    You can also send your model card in the folder you uploaded with the CLI by placing it in a `README.md` file
-    inside `path/to/awesome-name-you-picked/`.

 Using your model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -250,7 +272,8 @@ First you need to install `git-lfs` in the environment used by the notebook:

    sudo apt-get install git-lfs

-Then you can use the :obj:`transformers-cli` to create your new repo:
+Then you can use either create a repo directly from `huggingface.co <https://huggingface.co/>`__ , or use the
+:obj:`transformers-cli` to create it:


 .. code-block:: bash
@ -262,13 +285,14 @@ Once it's created, you can clone it and configure it (replace username by your u

 .. code-block:: bash

+    git lfs install
+
    git clone https://username:password@huggingface.co/username/your-model-name
    # Alternatively if you have a token,
    # you can use it instead of your password
    git clone https://username:token@huggingface.co/username/your-model-name

    cd your-model-name
-    git lfs install
    git config --global user.email "email@example.com"
    # Tip: using the same email than for your huggingface.co account will link your commits to your profile
    git config --global user.name "Your name"
--- a/docs/source/model_summary.rst
+++ b/docs/source/model_summary.rst
@ -1,10 +1,22 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Summary of the models
 =======================================================================================================================

 This is a summary of the models available in 🤗 Transformers. It assumes you’re familiar with the original `transformer
 model <https://arxiv.org/abs/1706.03762>`_. For a gentle introduction check the `annotated transformer
 <http://nlp.seas.harvard.edu/2018/04/03/attention.html>`_. Here we focus on the high-level differences between the
-models. You can check them more in detail in their respective documentation. Also checkout the :doc:`pretrained model
+models. You can check them more in detail in their respective documentation. Also check out the :doc:`pretrained model
 page </pretrained_models>` to see the checkpoints available for each type of model and all `the community models
 <https://huggingface.co/models>`_.

@ -18,7 +30,7 @@ Each one of the models in the library falls into one of the following categories

 Autoregressive models are pretrained on the classic language modeling task: guess the next token having read all the
 previous ones. They correspond to the decoder of the original transformer model, and a mask is used on top of the full
-sentence so that the attention heads can only see what was before in the next, and not what’s after. Although those
+sentence so that the attention heads can only see what was before in the text, and not what’s after. Although those
 models can be fine-tuned and achieve great results on many tasks, the most natural application is text generation. A
 typical example of such models is GPT.

@ -318,6 +330,36 @@ the same probabilities as the larger model. The actual objective is a combinatio
 The library provides a version of the model for masked language modeling, token classification, sentence classification
 and question answering.

+ConvBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=convbert">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-convbert-blueviolet">
+   </a>
+   <a href="model_doc/convbert.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-convbert-blueviolet">
+   </a>
+
+`ConvBERT: Improving BERT with Span-based Dynamic Convolution <https://arxiv.org/abs/1910.01108>`_, Zihang Jiang,
+Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
+
+Pre-trained language models like BERT and its variants have recently achieved impressive performance in various natural
+language understanding tasks. However, BERT heavily relies on the global self-attention block and thus suffers large
+memory footprint and computation cost. Although all its attention heads query on the whole input sequence for
+generating the attention map from a global perspective, we observe some heads only need to learn local dependencies,
+which means the existence of computation redundancy. We therefore propose a novel span-based dynamic convolution to
+replace these self-attention heads to directly model local dependencies. The novel convolution heads, together with the
+rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context
+learning. We equip BERT with this mixed attention design and build a ConvBERT model. Experiments have shown that
+ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and
+fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while
+using less than 1/4 training cost.
+
+The library provides a version of the model for masked language modeling, token classification, sentence classification
+and question answering.
+
 XLM
 -----------------------------------------------------------------------------------------------------------------------

@ -500,8 +542,8 @@ BART
 <https://arxiv.org/abs/1910.13461>`_, Mike Lewis et al.

 Sequence-to-sequence model with an encoder and a decoder. Encoder is fed a corrupted version of the tokens, decoder is
-fed the original tokens (but has a mask to hide the future words like a regular transformers decoder). For the encoder
-, on the pretraining tasks, a composition of the following transformations are applied:
+fed the original tokens (but has a mask to hide the future words like a regular transformers decoder). A composition of
+the following transformations are applied on the pretraining tasks for the encoder:

  * mask random tokens (like in BERT)
  * delete random tokens
@ -527,10 +569,10 @@ Pegasus
 <https://arxiv.org/pdf/1912.08777.pdf>`_, Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.

 Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on
-two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pre-training
+two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining
 objective, called Gap Sentence Generation (GSG).

-  * MLM: encoder input tokens are randomely replaced by a mask tokens and have to be predicted by the encoder (like in
+  * MLM: encoder input tokens are randomly replaced by a mask tokens and have to be predicted by the encoder (like in
    BERT)
  * GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a
    causal mask to hide the future words like a regular auto-regressive transformer decoder.
@ -609,7 +651,7 @@ MT5
 `mT5: A massively multilingual pre-trained text-to-text transformer <https://arxiv.org/abs/2010.11934>`_, Linting Xue
 et al.

-The model architecture is same as T5. mT5's pre-training objective includes T5's self-supervised training, but not T5's
+The model architecture is same as T5. mT5's pretraining objective includes T5's self-supervised training, but not T5's
 supervised training. mT5 is trained on 101 languages.

 The library provides a version of this model for conditional generation.
@ -630,8 +672,8 @@ MBart
 `Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu,
 Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.

-The model architecture and pre-training objective is same as BART, but MBart is trained on 25 languages and is intended
-for supervised and unsupervised machine translation. MBart is one of the first methods for pre-training a complete
+The model architecture and pretraining objective is same as BART, but MBart is trained on 25 languages and is intended
+for supervised and unsupervised machine translation. MBart is one of the first methods for pretraining a complete
 sequence-to-sequence model by denoising full texts in multiple languages,

 The library provides a version of this model for conditional generation.
@ -658,7 +700,7 @@ ProphetNet
 `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.

-ProphetNet introduces a novel *sequence-to-sequence* pre-training objective, called *future n-gram prediction*. In
+ProphetNet introduces a novel *sequence-to-sequence* pretraining objective, called *future n-gram prediction*. In
 future n-gram prediction, the model predicts the next n tokens simultaneously based on previous context tokens at each
 time step instead instead of just the single next token. The future n-gram prediction explicitly encourages the model
 to plan for the future tokens and prevent overfitting on strong local correlations. The model architecture is based on
@ -683,8 +725,8 @@ XLM-ProphetNet
 `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.

-XLM-ProphetNet's model architecture and pre-training objective is same as ProphetNet, but XLM-ProphetNet was
-pre-trained on the cross-lingual dataset `XGLUE <https://arxiv.org/abs/2004.01401>`__.
+XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained
+on the cross-lingual dataset `XGLUE <https://arxiv.org/abs/2004.01401>`__.

 The library provides a pre-trained version of this model for multi-lingual conditional generation and fine-tuned
 versions for headline generation and question generation, respectively.
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`$PYTHON setup.py install # Python command to install the script.`